mirror of
https://github.com/tencentmusic/supersonic.git
synced 2025-12-13 21:17:08 +00:00
first commit
This commit is contained in:
@@ -0,0 +1,319 @@
|
||||
package com.hankcs.hanlp.collection.trie.bintrie;
|
||||
|
||||
import com.hankcs.hanlp.corpus.io.ByteArray;
|
||||
import java.io.DataOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.ObjectInput;
|
||||
import java.io.ObjectOutput;
|
||||
import java.util.AbstractMap;
|
||||
import java.util.ArrayDeque;
|
||||
import java.util.Arrays;
|
||||
import java.util.Map;
|
||||
import java.util.Queue;
|
||||
import java.util.Set;
|
||||
|
||||
|
||||
/**
 * Base class for a node in a binary trie (BinTrie). Each node holds one character,
 * a word-formation {@link Status}, an optional value of type V, and an array of
 * child nodes. Serialization methods (walkToSave/walkToLoad) must mirror each
 * other's field order exactly.
 */
public abstract class BaseNode<V> implements Comparable<BaseNode> {

    /**
     * Cached Status values so deserialization can map an ordinal back to a Status
     * without re-allocating Status.values() per node
     */
    static final Status[] ARRAY_STATUS = Status.values();
    /**
     * Child nodes
     */
    protected BaseNode[] child;
    /**
     * Word-formation state of this node
     */
    protected Status status;
    /**
     * The character this node represents
     */
    protected char c;
    /**
     * The value attached to this node (only meaningful when status marks a word end)
     */
    protected V value;

    // Prefix accumulated during the breadth-first traversal in walkLimit();
    // mutable traversal scratch state, not part of the trie structure itself
    public String prefix = null;

    /**
     * Walks from this node along the characters of path starting at index begin.
     *
     * @param path  the string to follow
     * @param begin index of the first character to consume
     * @return the node reached, or null if the path leaves the trie or hits a deleted node
     */
    public BaseNode<V> transition(String path, int begin) {
        BaseNode<V> cur = this;
        for (int i = begin; i < path.length(); ++i) {
            cur = cur.getChild(path.charAt(i));
            // UNDEFINED_0 marks a logically deleted entry — treat it as absent
            if (cur == null || cur.status == Status.UNDEFINED_0) {
                return null;
            }
        }
        return cur;
    }

    /**
     * Walks from this node along the characters of path starting at index begin.
     *
     * @param path  the char array to follow
     * @param begin index of the first character to consume
     * @return the node reached, or null if the path leaves the trie or hits a deleted node
     */
    public BaseNode<V> transition(char[] path, int begin) {
        BaseNode<V> cur = this;
        for (int i = begin; i < path.length; ++i) {
            cur = cur.getChild(path[i]);
            if (cur == null || cur.status == Status.UNDEFINED_0) {
                return null;
            }
        }
        return cur;
    }

    /**
     * Transitions one step by a single character.
     *
     * @param path the character to follow
     * @return the child node, or null if absent or deleted
     */
    public BaseNode<V> transition(char path) {
        BaseNode<V> cur = this;
        cur = cur.getChild(path);
        if (cur == null || cur.status == Status.UNDEFINED_0) {
            return null;
        }
        return cur;
    }

    /**
     * Adds a child node.
     *
     * @return true if a new node was inserted, false if an existing node was modified
     */
    protected abstract boolean addChild(BaseNode node);

    /**
     * Whether this node has a child for the given character.
     *
     * @param c the child's char
     * @return true if present
     */
    protected boolean hasChild(char c) {
        return getChild(c) != null;
    }

    protected char getChar() {
        return c;
    }

    /**
     * Gets the child node for a character.
     *
     * @param c the child's char
     * @return the child node, or null if absent
     */
    public abstract BaseNode getChild(char c);

    /**
     * Gets the value stored at this node.
     *
     * @return the value
     */
    public final V getValue() {
        return value;
    }

    /**
     * Sets the value stored at this node.
     *
     * @param value the value
     */
    public final void setValue(V value) {
        this.value = value;
    }

    @Override
    public int compareTo(BaseNode other) {
        return compareTo(other.getChar());
    }

    /**
     * Overload: compares this node's character with a raw character.
     *
     * @param other the character to compare against
     * @return 1, -1 or 0 following the Comparable convention
     */
    public int compareTo(char other) {
        if (this.c > other) {
            return 1;
        }
        if (this.c < other) {
            return -1;
        }
        return 0;
    }

    /**
     * Gets the word-formation status of this node.
     *
     * @return the status
     */
    public Status getStatus() {
        return status;
    }

    /**
     * Depth-first traversal collecting every (word, value) entry under this node.
     * sb carries the prefix built so far; each child gets its own StringBuilder copy
     * so siblings do not see each other's appends.
     */
    protected void walk(StringBuilder sb, Set<Map.Entry<String, V>> entrySet) {
        sb.append(c);
        if (status == Status.WORD_MIDDLE_2 || status == Status.WORD_END_3) {
            entrySet.add(new TrieEntry(sb.toString(), value));
        }
        if (child == null) {
            return;
        }
        for (BaseNode node : child) {
            if (node == null) {
                continue;
            }
            node.walk(new StringBuilder(sb.toString()), entrySet);
        }
    }

    /**
     * Serializes this subtree: char, status ordinal, child count, then each child
     * recursively. Values are NOT written here — the caller persists them separately
     * (see the matching walkToLoad(ByteArray, _ValueArray) which pulls values from a
     * parallel value array in the same traversal order).
     */
    protected void walkToSave(DataOutputStream out) throws IOException {
        out.writeChar(c);
        out.writeInt(status.ordinal());
        int childSize = 0;
        if (child != null) {
            childSize = child.length;
        }
        out.writeInt(childSize);
        if (child == null) {
            return;
        }
        for (BaseNode node : child) {
            node.walkToSave(out);
        }
    }

    /**
     * Serializes this subtree to an ObjectOutput. Unlike the DataOutputStream
     * variant, the value object is written inline for word-ending nodes.
     */
    protected void walkToSave(ObjectOutput out) throws IOException {
        out.writeChar(c);
        out.writeInt(status.ordinal());
        if (status == Status.WORD_END_3 || status == Status.WORD_MIDDLE_2) {
            out.writeObject(value);
        }
        int childSize = 0;
        if (child != null) {
            childSize = child.length;
        }
        out.writeInt(childSize);
        if (child == null) {
            return;
        }
        for (BaseNode node : child) {
            node.walkToSave(out);
        }
    }

    /**
     * Deserializes a subtree written by walkToSave(DataOutputStream); values for
     * word-ending nodes are pulled in traversal order from valueArray.
     */
    protected void walkToLoad(ByteArray byteArray, _ValueArray<V> valueArray) {
        c = byteArray.nextChar();
        status = ARRAY_STATUS[byteArray.nextInt()];
        if (status == Status.WORD_END_3 || status == Status.WORD_MIDDLE_2) {
            value = valueArray.nextValue();
        }
        int childSize = byteArray.nextInt();
        child = new BaseNode[childSize];
        for (int i = 0; i < childSize; ++i) {
            child[i] = new Node<V>();
            child[i].walkToLoad(byteArray, valueArray);
        }
    }

    /**
     * Deserializes a subtree written by walkToSave(ObjectOutput); values are read
     * inline for word-ending nodes.
     */
    protected void walkToLoad(ObjectInput byteArray) throws IOException, ClassNotFoundException {
        c = byteArray.readChar();
        status = ARRAY_STATUS[byteArray.readInt()];
        if (status == Status.WORD_END_3 || status == Status.WORD_MIDDLE_2) {
            value = (V) byteArray.readObject();
        }
        int childSize = byteArray.readInt();
        child = new BaseNode[childSize];
        for (int i = 0; i < childSize; ++i) {
            child[i] = new Node<V>();
            child[i].walkToLoad(byteArray);
        }
    }

    /**
     * Word-formation state of a node. NOTE: ordinals are persisted by walkToSave,
     * so the declaration order must never change.
     */
    public enum Status {
        /**
         * Undefined; used to mark deleted entries
         */
        UNDEFINED_0,
        /**
         * Not the end of a word
         */
        NOT_WORD_1,
        /**
         * End of a word, and longer words continue through this node
         */
        WORD_MIDDLE_2,
        /**
         * End of a word, with no continuation
         */
        WORD_END_3,
    }

    /**
     * A (word, value) map entry produced by trie traversal, ordered by key.
     */
    public class TrieEntry extends AbstractMap.SimpleEntry<String, V> implements Comparable<TrieEntry> {

        public TrieEntry(String key, V value) {
            super(key, value);
        }

        @Override
        public int compareTo(TrieEntry o) {
            return getKey().compareTo(String.valueOf(o.getKey()));
        }
    }

    @Override
    public String toString() {
        return "BaseNode{"
            + "child="
            + Arrays.toString(child)
            + ", status="
            + status
            + ", c="
            + c
            + ", value="
            + value
            + ", prefix='"
            + prefix
            + '\''
            + '}';
    }

    /**
     * If this node ends a word, adds (prefix + c, value) to entrySet.
     * Relies on {@link #prefix} having been populated by walkLimit().
     */
    public void walkNode(Set<Map.Entry<String, V>> entrySet) {
        if (status == Status.WORD_MIDDLE_2 || status == Status.WORD_END_3) {
            String name = this.prefix != null ? this.prefix + c : "" + c;
            entrySet.add(new TrieEntry(name, value));
        }
    }

    /***
     * Breadth-first traversal that stops once entrySet holds limit entries.
     * Mutates the prefix field of visited nodes as traversal scratch state.
     *
     * @param sb       prefix accumulated before this node
     * @param entrySet output set of (word, value) entries
     * @param limit    maximum number of entries to collect
     */
    public void walkLimit(StringBuilder sb, Set<Map.Entry<String, V>> entrySet, int limit) {
        Queue<BaseNode> queue = new ArrayDeque<>();
        this.prefix = sb.toString();
        queue.add(this);
        while (!queue.isEmpty()) {
            if (entrySet.size() >= limit) {
                break;
            }
            BaseNode root = queue.poll();
            if (root == null) {
                continue;
            }
            root.walkNode(entrySet);
            if (root.child == null) {
                continue;
            }
            String prefix = root.prefix + root.c;
            // NOTE(review): unlike walk(), null elements in child are not skipped here,
            // and ArrayDeque.add(null) throws NPE — verify child arrays are dense,
            // or that subclasses never leave null slots. TODO confirm.
            for (BaseNode node : root.child) {
                node.prefix = prefix;
                queue.add(node);
            }
        }
    }

}
|
||||
@@ -0,0 +1,393 @@
|
||||
package com.hankcs.hanlp.dictionary;
|
||||
|
||||
|
||||
import static com.hankcs.hanlp.utility.Predefine.logger;
|
||||
|
||||
import com.hankcs.hanlp.HanLP;
|
||||
import com.hankcs.hanlp.collection.trie.DoubleArrayTrie;
|
||||
import com.hankcs.hanlp.corpus.io.ByteArray;
|
||||
import com.hankcs.hanlp.corpus.io.IOUtil;
|
||||
import com.hankcs.hanlp.corpus.tag.Nature;
|
||||
import com.hankcs.hanlp.utility.Predefine;
|
||||
import com.hankcs.hanlp.utility.TextUtility;
|
||||
|
||||
import java.io.BufferedOutputStream;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.DataOutputStream;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.Serializable;
|
||||
import java.util.Collection;
|
||||
import java.util.TreeMap;
|
||||
|
||||
/**
|
||||
* 使用DoubleArrayTrie实现的核心词典
|
||||
*/
|
||||
/**
 * The core dictionary, implemented on top of a DoubleArrayTrie.
 * Loading prefers a binary cache (path + BIN_EXT); otherwise the text dictionary
 * is parsed and a fresh cache is written. The cache layout written by load() must
 * stay in sync with what loadDat() and Attribute.save() read/write.
 */
public class CoreDictionary {

    // The trie holding every dictionary entry; public because segmentation code reads it directly
    public static DoubleArrayTrie<Attribute> trie = new DoubleArrayTrie<Attribute>();

    public static final String PATH = HanLP.Config.CoreDictionaryPath;

    // Load the dictionary automatically at class-initialization time; failure is fatal
    static {
        long start = System.currentTimeMillis();
        if (!load(PATH)) {
            throw new IllegalArgumentException("核心词典" + PATH + "加载失败");
        } else {
            logger.info(PATH + "加载成功," + trie.size() + "个词条,耗时" + (System.currentTimeMillis() - start) + "ms");
        }
    }

    // Word IDs of some special tags; these must stay below the static block above,
    // since getWordID needs the trie to be fully loaded
    public static final int NR_WORD_ID = getWordID(Predefine.TAG_PEOPLE);
    public static final int NS_WORD_ID = getWordID(Predefine.TAG_PLACE);
    public static final int NT_WORD_ID = getWordID(Predefine.TAG_GROUP);
    public static final int T_WORD_ID = getWordID(Predefine.TAG_TIME);
    public static final int X_WORD_ID = getWordID(Predefine.TAG_CLUSTER);
    public static final int M_WORD_ID = getWordID(Predefine.TAG_NUMBER);
    public static final int NX_WORD_ID = getWordID(Predefine.TAG_PROPER);

    /**
     * Loads the dictionary: tries the binary cache first; otherwise parses the
     * UTF-8 text dictionary (one entry per line: word nature1 freq1 [nature2 freq2 ...]),
     * builds the trie, and writes a binary cache next to the source file.
     *
     * @param path path of the text dictionary
     * @return true on success
     */
    private static boolean load(String path) {
        logger.info("核心词典开始加载:" + path);
        if (loadDat(path)) {
            return true;
        }
        // TreeMap: DoubleArrayTrie.build requires keys in sorted order
        TreeMap<String, Attribute> map = new TreeMap<String, Attribute>();
        BufferedReader br = null;
        try {
            br = new BufferedReader(new InputStreamReader(IOUtil.newInputStream(path), "UTF-8"));
            String line;
            int totalFrequency = 0;
            long start = System.currentTimeMillis();
            while ((line = br.readLine()) != null) {
                String[] param = line.split("\\s");
                // tokens after the word come in (nature, frequency) pairs
                int natureCount = (param.length - 1) / 2;
                Attribute attribute = new Attribute(natureCount);
                for (int i = 0; i < natureCount; ++i) {
                    attribute.nature[i] = Nature.create(param[1 + 2 * i]);
                    attribute.frequency[i] = Integer.parseInt(param[2 + 2 * i]);
                    attribute.totalFrequency += attribute.frequency[i];
                }
                map.put(param[0], attribute);
                totalFrequency += attribute.totalFrequency;
            }
            logger.info(
                "核心词典读入词条" + map.size() + " 全部频次" + totalFrequency + ",耗时" + (System.currentTimeMillis() - start)
                    + "ms");
            br.close();
            trie.build(map);
            logger.info("核心词典加载成功:" + trie.size() + "个词条,下面将写入缓存……");
            // Write the binary cache: entry count, attribute table, trie, then total frequency.
            // loadDat() below reads back exactly this layout.
            try {
                DataOutputStream out = new DataOutputStream(
                    new BufferedOutputStream(IOUtil.newOutputStream(path + Predefine.BIN_EXT)));
                Collection<Attribute> attributeList = map.values();
                out.writeInt(attributeList.size());
                for (Attribute attribute : attributeList) {
                    out.writeInt(attribute.totalFrequency);
                    out.writeInt(attribute.nature.length);
                    for (int i = 0; i < attribute.nature.length; ++i) {
                        out.writeInt(attribute.nature[i].ordinal());
                        out.writeInt(attribute.frequency[i]);
                    }
                }
                trie.save(out);
                out.writeInt(totalFrequency);
                Predefine.setTotalFrequency(totalFrequency);
                out.close();
            } catch (Exception e) {
                // cache write failure is treated as a load failure, even though the
                // in-memory trie was built — NOTE(review): returning true here might
                // be acceptable; confirm intended semantics
                logger.warning("保存失败" + e);
                return false;
            }
        } catch (FileNotFoundException e) {
            logger.warning("核心词典" + path + "不存在!" + e);
            return false;
        } catch (IOException e) {
            logger.warning("核心词典" + path + "读取错误!" + e);
            return false;
        }

        return true;
    }

    /**
     * Loads the binary cache of the double-array trie from disk.
     * Layout mirrors what load() wrote: entry count, per-entry attribute records,
     * the trie itself, and (in newer caches) a trailing total frequency.
     *
     * @param path text-dictionary path (Predefine.BIN_EXT is appended)
     * @return true on success
     */
    static boolean loadDat(String path) {
        try {
            ByteArray byteArray = ByteArray.createByteArray(path + Predefine.BIN_EXT);
            if (byteArray == null) {
                return false;
            }
            int size = byteArray.nextInt();
            Attribute[] attributes = new Attribute[size];
            final Nature[] natureIndexArray = Nature.values();
            for (int i = 0; i < size; ++i) {
                // first int is the total frequency, second is the number of natures
                int currentTotalFrequency = byteArray.nextInt();
                int length = byteArray.nextInt();
                attributes[i] = new Attribute(length);
                attributes[i].totalFrequency = currentTotalFrequency;
                for (int j = 0; j < length; ++j) {
                    attributes[i].nature[j] = natureIndexArray[byteArray.nextInt()];
                    attributes[i].frequency[j] = byteArray.nextInt();
                }
            }
            if (!trie.load(byteArray, attributes)) {
                return false;
            }
            int totalFrequency = 0;
            if (byteArray.hasMore()) {
                // newer caches store the total frequency at the end
                totalFrequency = byteArray.nextInt();
            } else {
                // older caches do not; recompute it from the attribute table
                for (Attribute attribute : attributes) {
                    totalFrequency += attribute.totalFrequency;
                }
            }
            Predefine.setTotalFrequency(totalFrequency);
        } catch (Exception e) {
            logger.warning("读取失败,问题发生在" + e);
            return false;
        }
        return true;
    }

    /**
     * Gets the attribute of a word.
     *
     * @param key the word
     * @return its attribute, or null when absent
     */
    public static Attribute get(String key) {
        return trie.get(key);
    }

    /**
     * Gets the attribute by word ID.
     *
     * @param wordID id from getWordID
     * @return the attribute
     */
    public static Attribute get(int wordID) {
        return trie.get(wordID);
    }

    /**
     * Gets a term's total frequency.
     *
     * @param term the word
     * @return total frequency, or 0 when the word is unknown
     */
    public static int getTermFrequency(String term) {
        Attribute attribute = get(term);
        if (attribute == null) {
            return 0;
        }
        return attribute.totalFrequency;
    }

    /**
     * Whether the dictionary contains the word.
     *
     * @param key the word
     * @return true if present
     */
    public static boolean contains(String key) {
        return trie.get(key) != null;
    }

    /**
     * Attribute of a word in the core dictionary: its natures (parts of speech)
     * and the per-nature frequencies.
     */
    public static class Attribute implements Serializable {

        /**
         * Nature (part-of-speech) list
         */
        public Nature[] nature;
        /**
         * Frequency for each nature, index-aligned with nature
         */
        public int[] frequency;

        public int totalFrequency;
        // original surface form, if any; not persisted by save()
        public String original = null;


        public Attribute(int size) {
            nature = new Nature[size];
            frequency = new int[size];
        }

        public Attribute(Nature[] nature, int[] frequency) {
            this.nature = nature;
            this.frequency = frequency;
        }

        public Attribute(Nature nature, int frequency) {
            this(1);
            this.nature[0] = nature;
            this.frequency[0] = frequency;
            totalFrequency = frequency;
        }

        public Attribute(Nature[] nature, int[] frequency, int totalFrequency) {
            this.nature = nature;
            this.frequency = frequency;
            this.totalFrequency = totalFrequency;
        }

        /**
         * Constructs with a single nature and a default frequency of 1000.
         *
         * @param nature the nature
         */
        public Attribute(Nature nature) {
            this(nature, 1000);
        }

        /**
         * Parses "nature1 freq1 nature2 freq2 ..." into an Attribute.
         * An odd token count falls back to a single nature with frequency 1.
         *
         * @return the attribute, or null when parsing fails
         */
        public static Attribute create(String natureWithFrequency) {
            try {
                String[] param = natureWithFrequency.split(" ");
                if (param.length % 2 != 0) {
                    // safety fallback: treat the whole string as one nature, frequency 1
                    return new Attribute(Nature.create(natureWithFrequency.trim()), 1);
                }
                int natureCount = param.length / 2;
                Attribute attribute = new Attribute(natureCount);
                for (int i = 0; i < natureCount; ++i) {
                    attribute.nature[i] = Nature.create(param[2 * i]);
                    attribute.frequency[i] = Integer.parseInt(param[1 + 2 * i]);
                    attribute.totalFrequency += attribute.frequency[i];
                }
                return attribute;
            } catch (Exception e) {
                logger.warning("使用字符串" + natureWithFrequency + "创建词条属性失败!" + TextUtility.exceptionToString(e));
                return null;
            }
        }

        /**
         * Loads one Attribute from a byte stream (same per-entry layout as loadDat).
         *
         * @param byteArray        the source stream
         * @param natureIndexArray Nature.values(), used to map ordinals back to Natures
         * @return the attribute
         */
        public static Attribute create(ByteArray byteArray, Nature[] natureIndexArray) {
            int currentTotalFrequency = byteArray.nextInt();
            int length = byteArray.nextInt();
            Attribute attribute = new Attribute(length);
            attribute.totalFrequency = currentTotalFrequency;
            for (int j = 0; j < length; ++j) {
                attribute.nature[j] = natureIndexArray[byteArray.nextInt()];
                attribute.frequency[j] = byteArray.nextInt();
            }

            return attribute;
        }

        /**
         * Gets the frequency for a nature given as a string.
         *
         * @param nature nature name
         * @return frequency, 0 when the nature is unknown or absent
         * @deprecated prefer the Nature-typed overload
         */
        public int getNatureFrequency(String nature) {
            try {
                Nature pos = Nature.create(nature);
                return getNatureFrequency(pos);
            } catch (IllegalArgumentException e) {
                return 0;
            }
        }

        /**
         * Gets the frequency for a nature.
         *
         * @param nature the nature
         * @return frequency, 0 when this attribute lacks the nature
         */
        public int getNatureFrequency(final Nature nature) {
            int i = 0;
            // reference comparison is intentional: Nature instances are interned/enumerated
            for (Nature pos : this.nature) {
                if (nature == pos) {
                    return frequency[i];
                }
                ++i;
            }
            return 0;
        }

        /**
         * Whether this attribute has the given nature.
         *
         * @param nature the nature
         * @return true if present with a positive frequency
         */
        public boolean hasNature(Nature nature) {
            return getNatureFrequency(nature) > 0;
        }

        /**
         * Whether any nature starts with the given prefix.
         *
         * @param prefix nature prefix, e.g. "u" matches ude, uzhe, ...
         * @return true if any nature matches
         */
        public boolean hasNatureStartsWith(String prefix) {
            for (Nature n : nature) {
                if (n.startsWith(prefix)) {
                    return true;
                }
            }
            return false;
        }

        @Override
        public String toString() {
            final StringBuilder sb = new StringBuilder();
            for (int i = 0; i < nature.length; ++i) {
                sb.append(nature[i]).append(' ').append(frequency[i]).append(' ');
            }
            return sb.toString();
        }

        /**
         * Writes this attribute in the binary-cache layout used by load()/loadDat().
         */
        public void save(DataOutputStream out) throws IOException {
            out.writeInt(totalFrequency);
            out.writeInt(nature.length);
            for (int i = 0; i < nature.length; ++i) {
                out.writeInt(nature[i].ordinal());
                out.writeInt(frequency[i]);
            }
        }
    }

    /**
     * Gets a word's ID (its exact-match index in the trie).
     *
     * @param a the word
     * @return the ID, or -1 when absent
     */
    public static int getWordID(String a) {
        return CoreDictionary.trie.exactMatchSearch(a);
    }

    /**
     * Hot-reloads the core dictionary by deleting the binary cache and loading again.
     * Cluster deployments (or other IOAdapters) must delete their own cache files.
     *
     * @return true on success
     */
    public static boolean reload() {
        String path = CoreDictionary.PATH;
        IOUtil.deleteFile(path + Predefine.BIN_EXT);

        return load(path);
    }
}
|
||||
|
||||
@@ -0,0 +1,69 @@
|
||||
package com.hankcs.hanlp.seg.common;
|
||||
|
||||
import com.hankcs.hanlp.corpus.tag.Nature;
|
||||
import com.hankcs.hanlp.dictionary.CoreDictionary;
|
||||
import com.hankcs.hanlp.dictionary.CustomDictionary;
|
||||
import com.tencent.supersonic.knowledge.infrastructure.nlp.HanlpHelper;
|
||||
import lombok.Data;
|
||||
import lombok.ToString;
|
||||
|
||||
@Data
|
||||
@ToString
|
||||
public class Term {
|
||||
|
||||
public String word;
|
||||
|
||||
public Nature nature;
|
||||
public int offset;
|
||||
public int frequency = 0;
|
||||
|
||||
public Term(String word, Nature nature) {
|
||||
this.word = word;
|
||||
this.nature = nature;
|
||||
}
|
||||
|
||||
public Term(String word, Nature nature, int offset) {
|
||||
this.word = word;
|
||||
this.nature = nature;
|
||||
this.offset = offset;
|
||||
}
|
||||
|
||||
public Term(String word, Nature nature, int offset, int frequency) {
|
||||
this.word = word;
|
||||
this.nature = nature;
|
||||
this.offset = offset;
|
||||
this.frequency = frequency;
|
||||
}
|
||||
|
||||
public int length() {
|
||||
return this.word.length();
|
||||
}
|
||||
|
||||
public int getFrequency() {
|
||||
if (frequency > 0) {
|
||||
return frequency;
|
||||
}
|
||||
String wordOri = word.toLowerCase();
|
||||
CoreDictionary.Attribute attribute = HanlpHelper.getDynamicCustomDictionary().get(wordOri);
|
||||
if (attribute == null) {
|
||||
attribute = CoreDictionary.get(wordOri);
|
||||
if (attribute == null) {
|
||||
attribute = CustomDictionary.get(wordOri);
|
||||
}
|
||||
}
|
||||
if (attribute != null && nature != null && attribute.hasNature(nature)) {
|
||||
return attribute.getNatureFrequency(nature);
|
||||
}
|
||||
return attribute == null ? 0 : attribute.totalFrequency;
|
||||
}
|
||||
|
||||
public boolean equals(Object obj) {
|
||||
if (obj instanceof Term) {
|
||||
Term term = (Term) obj;
|
||||
if (this.nature == term.nature && this.word.equals(term.word)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return super.equals(obj);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,57 @@
|
||||
package com.tencent.supersonic.knowledge.application.online;
|
||||
|
||||
import com.tencent.supersonic.common.nlp.ItemDO;
|
||||
import com.tencent.supersonic.common.nlp.NatureType;
|
||||
import com.tencent.supersonic.common.nlp.WordNature;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
/**
|
||||
* base word nature
|
||||
*/
|
||||
@Slf4j
|
||||
public abstract class BaseWordNature {
|
||||
|
||||
/**
|
||||
* 获取所有wordNature
|
||||
*
|
||||
* @param itemDOS
|
||||
* @return
|
||||
*/
|
||||
public List<WordNature> getWordNatureList(List<ItemDO> itemDOS) {
|
||||
List<WordNature> wordNatures = new ArrayList<>();
|
||||
try {
|
||||
wordNatures = getWordNaturesWithException(itemDOS);
|
||||
} catch (Exception e) {
|
||||
log.error("getWordNatureList error,", e);
|
||||
}
|
||||
return wordNatures;
|
||||
}
|
||||
|
||||
public List<WordNature> getWordNaturesWithException(List<ItemDO> itemDOS) {
|
||||
|
||||
List<WordNature> wordNatures = new ArrayList<>();
|
||||
|
||||
for (ItemDO itemDO : itemDOS) {
|
||||
wordNatures.addAll(getWordNature(itemDO.getName(), itemDO));
|
||||
}
|
||||
return wordNatures;
|
||||
}
|
||||
|
||||
public abstract List<WordNature> getWordNature(String word, ItemDO itemDO);
|
||||
|
||||
public Integer getElementID(String nature) {
|
||||
String[] split = nature.split(NatureType.NATURE_SPILT);
|
||||
if (split.length >= 3) {
|
||||
return Integer.valueOf(split[2]);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
public static Integer getDomain(String nature) {
|
||||
String[] split = nature.split(NatureType.NATURE_SPILT);
|
||||
return Integer.valueOf(split[1]);
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,49 @@
|
||||
package com.tencent.supersonic.knowledge.application.online;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import com.tencent.supersonic.common.nlp.ItemDO;
|
||||
import com.tencent.supersonic.common.nlp.NatureType;
|
||||
import com.tencent.supersonic.common.nlp.WordNature;
|
||||
import java.util.List;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
/**
|
||||
* dimension word nature
|
||||
*/
|
||||
@Service
|
||||
public class DimensionWordNature extends BaseWordNature {
|
||||
|
||||
@Value("${nlp.dimension.use.suffix:true}")
|
||||
private boolean nlpDimensionUseSuffix = true;
|
||||
|
||||
|
||||
@Override
|
||||
public List<WordNature> getWordNature(String word, ItemDO itemDO) {
|
||||
List<WordNature> result = Lists.newArrayList();
|
||||
result.add(getOnwWordNature(word, itemDO, false));
|
||||
if (nlpDimensionUseSuffix) {
|
||||
String reverseWord = StringUtils.reverse(word);
|
||||
if (StringUtils.isNotEmpty(word) && !word.equalsIgnoreCase(reverseWord)) {
|
||||
result.add(getOnwWordNature(reverseWord, itemDO, true));
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
private WordNature getOnwWordNature(String word, ItemDO itemDO, boolean isSuffix) {
|
||||
WordNature wordNature = new WordNature();
|
||||
wordNature.setWord(word);
|
||||
Integer classId = itemDO.getDomain();
|
||||
String nature = NatureType.NATURE_SPILT + classId + NatureType.NATURE_SPILT + itemDO.getItemId()
|
||||
+ NatureType.DIMENSION.getType();
|
||||
if (isSuffix) {
|
||||
nature = NatureType.NATURE_SPILT + classId + NatureType.NATURE_SPILT + itemDO.getItemId()
|
||||
+ NatureType.SUFFIX.getType() + NatureType.DIMENSION.getType();
|
||||
}
|
||||
wordNature.setNatureWithFrequency(String.format("%s 100000", nature));
|
||||
return wordNature;
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,29 @@
|
||||
package com.tencent.supersonic.knowledge.application.online;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import com.tencent.supersonic.common.nlp.ItemDO;
|
||||
import com.tencent.supersonic.common.nlp.NatureType;
|
||||
import com.tencent.supersonic.common.nlp.WordNature;
|
||||
import java.util.List;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
/**
|
||||
* domain word nature
|
||||
*/
|
||||
@Service
|
||||
@Slf4j
|
||||
public class DomainWordNature extends BaseWordNature {
|
||||
|
||||
@Override
|
||||
public List<WordNature> getWordNature(String word, ItemDO itemDO) {
|
||||
List<WordNature> result = Lists.newArrayList();
|
||||
WordNature wordNature = new WordNature();
|
||||
wordNature.setWord(word);
|
||||
Integer classId = itemDO.getDomain();
|
||||
String nature = NatureType.NATURE_SPILT + classId;
|
||||
wordNature.setNatureWithFrequency(String.format("%s 100000", nature));
|
||||
result.add(wordNature);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,31 @@
|
||||
package com.tencent.supersonic.knowledge.application.online;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import com.tencent.supersonic.common.nlp.ItemDO;
|
||||
import com.tencent.supersonic.common.nlp.NatureType;
|
||||
import com.tencent.supersonic.common.nlp.WordNature;
|
||||
import java.util.List;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
/**
|
||||
* dimension value wordNature
|
||||
*/
|
||||
@Service
|
||||
@Slf4j
|
||||
public class EntityWordNature extends BaseWordNature {
|
||||
|
||||
@Override
|
||||
public List<WordNature> getWordNature(String word, ItemDO itemDO) {
|
||||
List<WordNature> result = Lists.newArrayList();
|
||||
WordNature wordNature = new WordNature();
|
||||
wordNature.setWord(word);
|
||||
Integer domain = itemDO.getDomain();
|
||||
String nature = NatureType.NATURE_SPILT + domain + NatureType.NATURE_SPILT + itemDO.getItemId()
|
||||
+ NatureType.ENTITY.getType();
|
||||
wordNature.setNatureWithFrequency(String.format("%s 200000", nature));
|
||||
result.add(wordNature);
|
||||
return result;
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,48 @@
|
||||
package com.tencent.supersonic.knowledge.application.online;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import com.tencent.supersonic.common.nlp.ItemDO;
|
||||
import com.tencent.supersonic.common.nlp.NatureType;
|
||||
import com.tencent.supersonic.common.nlp.WordNature;
|
||||
import java.util.List;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
/**
|
||||
* Metric WordNature
|
||||
*/
|
||||
@Service
|
||||
public class MetricWordNature extends BaseWordNature {
|
||||
|
||||
@Value("${nlp.metric.use.suffix:true}")
|
||||
private boolean nlpMetricUseSuffix = true;
|
||||
|
||||
@Override
|
||||
public List<WordNature> getWordNature(String word, ItemDO itemDO) {
|
||||
List<WordNature> result = Lists.newArrayList();
|
||||
result.add(getOnwWordNature(word, itemDO, false));
|
||||
if (nlpMetricUseSuffix) {
|
||||
String reverseWord = StringUtils.reverse(word);
|
||||
if (!word.equalsIgnoreCase(reverseWord)) {
|
||||
result.add(getOnwWordNature(reverseWord, itemDO, true));
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
private WordNature getOnwWordNature(String word, ItemDO itemDO, boolean isSuffix) {
|
||||
WordNature wordNature = new WordNature();
|
||||
wordNature.setWord(word);
|
||||
Integer classId = itemDO.getDomain();
|
||||
String nature = NatureType.NATURE_SPILT + classId + NatureType.NATURE_SPILT + itemDO.getItemId()
|
||||
+ NatureType.METRIC.getType();
|
||||
if (isSuffix) {
|
||||
nature = NatureType.NATURE_SPILT + classId + NatureType.NATURE_SPILT + itemDO.getItemId()
|
||||
+ NatureType.SUFFIX.getType() + NatureType.METRIC.getType();
|
||||
}
|
||||
wordNature.setNatureWithFrequency(String.format("%s 100000", nature));
|
||||
return wordNature;
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,60 @@
|
||||
package com.tencent.supersonic.knowledge.application.online;
|
||||
|
||||
import com.tencent.supersonic.common.nlp.NatureType;
|
||||
import com.tencent.supersonic.common.nlp.WordNature;
|
||||
import com.tencent.supersonic.knowledge.domain.service.OnlineKnowledgeService;
|
||||
import com.tencent.supersonic.knowledge.infrastructure.nlp.HanlpHelper;
|
||||
import com.tencent.supersonic.knowledge.infrastructure.nlp.Suggester;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
/**
|
||||
* online knowledge service impl
|
||||
*/
|
||||
@Service
|
||||
public class OnlineKnowledgeServiceImpl implements OnlineKnowledgeService {
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(OnlineKnowledgeServiceImpl.class);
|
||||
|
||||
public void updateSemanticKnowledge(List<WordNature> natures) {
|
||||
|
||||
List<WordNature> prefixes = natures.stream()
|
||||
.filter(entry -> !entry.getNatureWithFrequency().contains(NatureType.SUFFIX.getType()))
|
||||
.collect(Collectors.toList());
|
||||
|
||||
for (WordNature nature : prefixes) {
|
||||
HanlpHelper.addToCustomDictionary(nature);
|
||||
}
|
||||
|
||||
List<WordNature> suffixes = natures.stream()
|
||||
.filter(entry -> entry.getNatureWithFrequency().contains(NatureType.SUFFIX.getType()))
|
||||
.collect(Collectors.toList());
|
||||
|
||||
Suggester.loadSuffix(suffixes);
|
||||
}
|
||||
|
||||
|
||||
public void reloadAllData(List<WordNature> natures) {
|
||||
// 1. reload custom knowledge
|
||||
try {
|
||||
HanlpHelper.reloadCustomDictionary();
|
||||
} catch (Exception e) {
|
||||
logger.error("reloadCustomDictionary error", e);
|
||||
}
|
||||
|
||||
// 2. update online knowledge
|
||||
updateOnlineKnowledge(natures);
|
||||
}
|
||||
|
||||
public void updateOnlineKnowledge(List<WordNature> natures) {
|
||||
try {
|
||||
updateSemanticKnowledge(natures);
|
||||
} catch (Exception e) {
|
||||
logger.error("updateSemanticKnowledge error", e);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,30 @@
|
||||
package com.tencent.supersonic.knowledge.application.online;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import com.tencent.supersonic.common.nlp.ItemDO;
|
||||
import com.tencent.supersonic.common.nlp.NatureType;
|
||||
import com.tencent.supersonic.common.nlp.WordNature;
|
||||
import java.util.List;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
/**
|
||||
* dimension value wordNature
|
||||
*/
|
||||
@Service
|
||||
@Slf4j
|
||||
public class ValueWordNature extends BaseWordNature {
|
||||
|
||||
@Override
|
||||
public List<WordNature> getWordNature(String word, ItemDO itemDO) {
|
||||
List<WordNature> result = Lists.newArrayList();
|
||||
WordNature wordNature = new WordNature();
|
||||
wordNature.setWord(word);
|
||||
Integer domain = itemDO.getDomain();
|
||||
String nature = NatureType.NATURE_SPILT + domain + NatureType.NATURE_SPILT + itemDO.getItemId();
|
||||
wordNature.setNatureWithFrequency(String.format("%s 100000", nature));
|
||||
result.add(wordNature);
|
||||
return result;
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,28 @@
|
||||
package com.tencent.supersonic.knowledge.application.online;
|
||||
|
||||
|
||||
import com.tencent.supersonic.common.nlp.NatureType;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
/**
|
||||
* WordNature Strategy Factory
|
||||
*/
|
||||
public class WordNatureStrategyFactory {
|
||||
|
||||
private static Map<NatureType, BaseWordNature> strategyFactory = new ConcurrentHashMap<>();
|
||||
|
||||
static {
|
||||
strategyFactory.put(NatureType.DIMENSION, new DimensionWordNature());
|
||||
strategyFactory.put(NatureType.METRIC, new MetricWordNature());
|
||||
strategyFactory.put(NatureType.DOMAIN, new DomainWordNature());
|
||||
strategyFactory.put(NatureType.ENTITY, new EntityWordNature());
|
||||
strategyFactory.put(NatureType.VALUE, new ValueWordNature());
|
||||
|
||||
|
||||
}
|
||||
|
||||
public static BaseWordNature get(NatureType strategyType) {
|
||||
return strategyFactory.get(strategyType);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,66 @@
|
||||
package com.tencent.supersonic.knowledge.domain;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
 * Abstraction over dictionary-file storage (local disk, etc.): backup, move,
 * create, write and delete dictionary files under the configured directories.
 */
public interface FileHandler {

    /**
     * backup files to a specific directory
     * config: dict.directory.backup
     *
     * @param fileName file name relative to the latest-dictionary directory
     */
    void backupFile(String fileName);

    /**
     * move files to a specific directory
     * not backup
     *
     * @param fileName        source file path
     * @param targetDirectory directory the file is moved into
     */
    void moveFile(String fileName, String targetDirectory);

    /**
     * create a directory
     * (including missing parent directories)
     *
     * @param path directory path to create
     */
    void createDir(String path);

    /**
     * @param path file or directory path
     * @return true when the path exists
     */
    Boolean existPath(String path);

    /**
     * write data to a specific file,
     * config dir: dict.directory.latest
     *
     * @param data     lines to write, one list element per line
     * @param fileName target file name inside the latest-dictionary directory
     * @param append   true to append, false to overwrite
     */
    void writeFile(List<String> data, String fileName, Boolean append);

    /**
     * get the knowledge file root directory
     *
     * @return absolute path of the latest-dictionary directory
     */
    String getDictRootPath();

    /**
     * delete dictionary file
     * automatic backup
     *
     * @param fileName file name inside the latest-dictionary directory
     * @return true when the operation completed
     */
    Boolean deleteDictFile(String fileName);

    /**
     * delete files directly without backup
     *
     * @param fileName file path to delete
     */
    void deleteFile(String fileName);
}
|
||||
@@ -0,0 +1,40 @@
|
||||
package com.tencent.supersonic.knowledge.domain;
|
||||
|
||||
import com.tencent.supersonic.knowledge.infrastructure.nlp.HanlpHelper;
|
||||
import java.io.FileNotFoundException;
|
||||
import lombok.Data;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.context.annotation.Configuration;
|
||||
|
||||
|
||||
/**
 * Locations of the dictionary directories used by the local file handler.
 */
@Data
@Configuration
@Slf4j
public class LocalFileConfig {

    // Current dictionary directory; resolved relative to the classpath location
    // of hanlp.properties (see getDictDirectoryLatest()).
    @Value("${dict.directory.latest:/data/dictionary/custom}")
    private String dictDirectoryLatest;

    // Backup directory; used as configured, NOT made classpath-relative.
    @Value("${dict.directory.backup:./dict/backup}")
    private String dictDirectoryBackup;

    /** Returns the latest-dictionary directory prefixed with the resource root. */
    public String getDictDirectoryLatest() {
        return getResourceDir() + dictDirectoryLatest;
    }

    public String getDictDirectoryBackup() {
        return dictDirectoryBackup;
    }

    /**
     * Resolves the directory containing hanlp.properties on the classpath.
     * Returns an empty string (and logs a warning) when the file is missing,
     * which makes getDictDirectoryLatest() fall back to the raw configured path.
     */
    private String getResourceDir() {
        String hanlpPropertiesPath = "";
        try {
            hanlpPropertiesPath = HanlpHelper.getHanlpPropertiesPath();
        } catch (FileNotFoundException e) {
            log.warn("getResourceDir, e:", e);
        }
        return hanlpPropertiesPath;
    }
}
|
||||
@@ -0,0 +1,139 @@
|
||||
package com.tencent.supersonic.knowledge.domain;
|
||||
|
||||
|
||||
import java.io.BufferedWriter;
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.nio.file.StandardCopyOption;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
import java.util.List;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.stereotype.Component;
|
||||
import org.springframework.util.CollectionUtils;
|
||||
|
||||
@Slf4j
|
||||
@Component
|
||||
public class LocalFileHandler implements FileHandler {
|
||||
|
||||
private final LocalFileConfig localFileConfig;
|
||||
|
||||
public LocalFileHandler(LocalFileConfig localFileConfig) {
|
||||
this.localFileConfig = localFileConfig;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void backupFile(String fileName) {
|
||||
String dictDirectoryBackup = localFileConfig.getDictDirectoryBackup();
|
||||
if (!existPath(dictDirectoryBackup)) {
|
||||
createDir(dictDirectoryBackup);
|
||||
}
|
||||
|
||||
String source = localFileConfig.getDictDirectoryLatest() + "/" + fileName;
|
||||
String target = dictDirectoryBackup + "/" + fileName;
|
||||
Path sourcePath = Paths.get(source);
|
||||
Path targetPath = Paths.get(target);
|
||||
try {
|
||||
Files.copy(sourcePath, targetPath, StandardCopyOption.REPLACE_EXISTING);
|
||||
log.info("File copied successfully!");
|
||||
} catch (IOException e) {
|
||||
log.info("Failed to copy file: " + e.getMessage());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void moveFile(String filePath, String targetDirectoryPath) {
|
||||
Path sourcePath = Paths.get(filePath);
|
||||
Path targetPath = Paths.get(targetDirectoryPath, sourcePath.getFileName().toString());
|
||||
try {
|
||||
Files.move(sourcePath, targetPath, StandardCopyOption.REPLACE_EXISTING);
|
||||
log.info("File moved successfully!");
|
||||
} catch (IOException e) {
|
||||
log.info("Failed to move file: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void createDir(String directoryPath) {
|
||||
Path path = Paths.get(directoryPath);
|
||||
try {
|
||||
Files.createDirectories(path);
|
||||
log.info("Directory created successfully!");
|
||||
} catch (IOException e) {
|
||||
log.info("Failed to create directory: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void deleteFile(String filePath) {
|
||||
Path path = Paths.get(filePath);
|
||||
try {
|
||||
Files.delete(path);
|
||||
log.info("File:{} deleted successfully!", getAbsolutePath(filePath));
|
||||
} catch (IOException e) {
|
||||
log.info("Failed to delete file:{}, e:", getAbsolutePath(filePath), e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Boolean existPath(String pathStr) {
|
||||
Path path = Paths.get(pathStr);
|
||||
if (Files.exists(path)) {
|
||||
log.info("path:{} exists!", getAbsolutePath(pathStr));
|
||||
return true;
|
||||
} else {
|
||||
log.info("path:{} not exists!", getAbsolutePath(pathStr));
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void writeFile(List<String> lines, String fileName, Boolean append) {
|
||||
String dictDirectoryLatest = localFileConfig.getDictDirectoryLatest();
|
||||
if (!existPath(dictDirectoryLatest)) {
|
||||
createDir(dictDirectoryLatest);
|
||||
}
|
||||
String filePath = dictDirectoryLatest + "/" + fileName;
|
||||
if (existPath(filePath)) {
|
||||
backupFile(fileName);
|
||||
}
|
||||
try (BufferedWriter writer = getWriter(filePath, append)) {
|
||||
if (!CollectionUtils.isEmpty(lines)) {
|
||||
for (String line : lines) {
|
||||
writer.write(line);
|
||||
writer.newLine();
|
||||
}
|
||||
}
|
||||
log.info("File:{} written successfully!", getAbsolutePath(filePath));
|
||||
} catch (IOException e) {
|
||||
log.info("Failed to write file:{}, e:", getAbsolutePath(filePath), e);
|
||||
}
|
||||
}
|
||||
|
||||
public String getAbsolutePath(String path) {
|
||||
return Paths.get(path).toAbsolutePath().toString();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String getDictRootPath() {
|
||||
return Paths.get(localFileConfig.getDictDirectoryLatest()).toAbsolutePath().toString();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Boolean deleteDictFile(String fileName) {
|
||||
backupFile(fileName);
|
||||
deleteFile(localFileConfig.getDictDirectoryLatest() + "/" + fileName);
|
||||
return true;
|
||||
}
|
||||
|
||||
private BufferedWriter getWriter(String filePath, Boolean append) throws IOException {
|
||||
if (append) {
|
||||
return Files.newBufferedWriter(Paths.get(filePath), StandardCharsets.UTF_8, StandardOpenOption.APPEND);
|
||||
}
|
||||
return Files.newBufferedWriter(Paths.get(filePath), StandardCharsets.UTF_8);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,59 @@
|
||||
package com.tencent.supersonic.knowledge.domain.converter;
|
||||
|
||||
import com.google.common.base.Strings;
|
||||
import com.tencent.supersonic.auth.api.authentication.pojo.User;
|
||||
import com.tencent.supersonic.common.enums.TaskStatusEnum;
|
||||
import com.tencent.supersonic.common.util.json.JsonUtil;
|
||||
import com.tencent.supersonic.knowledge.domain.dataobject.DictConfPO;
|
||||
import com.tencent.supersonic.knowledge.domain.dataobject.DimValueDictTaskPO;
|
||||
import com.tencent.supersonic.knowledge.domain.pojo.DictConfig;
|
||||
import com.tencent.supersonic.knowledge.domain.pojo.DimValue2DictCommand;
|
||||
import com.tencent.supersonic.knowledge.domain.pojo.DimValueInfo;
|
||||
import java.time.ZoneId;
|
||||
import java.time.format.DateTimeFormatter;
|
||||
import java.util.Date;
|
||||
import java.util.List;
|
||||
|
||||
public class DictTaskConverter {
|
||||
|
||||
private static String dateTimeFormatter = "yyyyMMddHHmmss";
|
||||
|
||||
public static DimValueDictTaskPO generateDimValueDictTaskPO(DimValue2DictCommand dimValue2DictCommend, User user) {
|
||||
DimValueDictTaskPO taskPO = new DimValueDictTaskPO();
|
||||
Date createAt = new Date();
|
||||
String date = DateTimeFormatter.ofPattern(dateTimeFormatter)
|
||||
.format(createAt.toInstant().atZone(ZoneId.systemDefault()).toLocalDateTime());
|
||||
String creator = Strings.isNullOrEmpty(user.getName()) ? "" : user.getName();
|
||||
String updateMode = dimValue2DictCommend.getUpdateMode().getValue();
|
||||
String name = String.format("DimValue_dic_%s_%s_%s", updateMode, creator, date);
|
||||
taskPO.setName(name);
|
||||
|
||||
taskPO.setCreatedAt(createAt);
|
||||
taskPO.setCommand(JsonUtil.toString(dimValue2DictCommend));
|
||||
taskPO.setStatus(TaskStatusEnum.RUNNING.getCode());
|
||||
taskPO.setCreatedBy(creator);
|
||||
|
||||
return taskPO;
|
||||
}
|
||||
|
||||
public static DictConfPO generateDictConfPO(DictConfig dictConfig, User user) {
|
||||
DictConfPO dictConfPO = new DictConfPO();
|
||||
dictConfPO.setDimValueInfos(JsonUtil.toString(dictConfig.getDimValueInfoList()));
|
||||
dictConfPO.setDomainId(dictConfig.getDomainId());
|
||||
|
||||
dictConfPO.setCreatedBy(user.getName());
|
||||
dictConfPO.setUpdatedBy(user.getName());
|
||||
dictConfPO.setCreatedAt(new Date());
|
||||
dictConfPO.setUpdatedAt(new Date());
|
||||
|
||||
return dictConfPO;
|
||||
}
|
||||
|
||||
public static DictConfig dictConfPO2Config(DictConfPO dictConfPO) {
|
||||
DictConfig dictConfig = new DictConfig();
|
||||
dictConfig.setDomainId(dictConfPO.getDomainId());
|
||||
List<DimValueInfo> dimValueInfos = JsonUtil.toList(dictConfPO.getDimValueInfos(), DimValueInfo.class);
|
||||
dictConfig.setDimValueInfoList(dimValueInfos);
|
||||
return dictConfig;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,20 @@
|
||||
package com.tencent.supersonic.knowledge.domain.dataobject;
|
||||
|
||||
import java.util.Date;
|
||||
import lombok.Data;
|
||||
|
||||
/**
 * Persistence object for a domain's dictionary configuration.
 */
@Data
public class DictConfPO {

    // Primary key.
    private Long id;

    // Domain this dictionary configuration belongs to.
    private Long domainId;

    // JSON-serialized list of DimValueInfo entries (see DictTaskConverter).
    private String dimValueInfos;

    // Audit columns.
    private String createdBy;
    private String updatedBy;
    private Date createdAt;
    private Date updatedAt;

}
|
||||
@@ -0,0 +1,35 @@
|
||||
package com.tencent.supersonic.knowledge.domain.dataobject;
|
||||
|
||||
import java.util.Date;
|
||||
import lombok.Data;
|
||||
import lombok.ToString;
|
||||
import org.apache.commons.codec.digest.DigestUtils;
|
||||
|
||||
@Data
|
||||
@ToString
|
||||
public class DimValueDictTaskPO {
|
||||
|
||||
private Long id;
|
||||
|
||||
private String name;
|
||||
|
||||
private String description;
|
||||
|
||||
private String command;
|
||||
|
||||
private String commandMd5;
|
||||
|
||||
private Integer status;
|
||||
|
||||
private String createdBy;
|
||||
|
||||
private Date createdAt;
|
||||
|
||||
private Double progress;
|
||||
|
||||
private Long elapsedMs;
|
||||
|
||||
public String getCommandMd5() {
|
||||
return DigestUtils.md5Hex(command);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,13 @@
|
||||
package com.tencent.supersonic.knowledge.domain.pojo;
|
||||
|
||||
import java.util.List;
|
||||
import lombok.Data;
|
||||
|
||||
|
||||
/**
 * Dictionary configuration for one domain: which dimension values are
 * exported to the dictionary and under what rules.
 */
@Data
public class DictConfig {

    // Domain the configuration applies to.
    private Long domainId;

    // Per-item dictionary settings (black/white/rule lists).
    private List<DimValueInfo> dimValueInfoList;
}
|
||||
@@ -0,0 +1,15 @@
|
||||
package com.tencent.supersonic.knowledge.domain.pojo;
|
||||
|
||||
|
||||
/**
 * Filter criteria for querying dictionary tasks. A null field means
 * "no constraint on that attribute".
 *
 * <p>The fields previously had no accessors at all, which made the class
 * unusable by callers and by MyBatis; plain getters/setters restore access
 * without adding a Lombok dependency to this file.
 */
public class DictTaskFilter {

    private Long id;

    private String name;

    private String createdBy;

    private String createdAt;

    private Integer status;

    public Long getId() {
        return id;
    }

    public void setId(Long id) {
        this.id = id;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getCreatedBy() {
        return createdBy;
    }

    public void setCreatedBy(String createdBy) {
        this.createdBy = createdBy;
    }

    public String getCreatedAt() {
        return createdAt;
    }

    public void setCreatedAt(String createdAt) {
        this.createdAt = createdAt;
    }

    public Integer getStatus() {
        return status;
    }

    public void setStatus(Integer status) {
        this.status = status;
    }
}
|
||||
@@ -0,0 +1,30 @@
|
||||
package com.tencent.supersonic.knowledge.domain.pojo;
|
||||
|
||||
/**
 * Update modes for dimension-value dictionary generation.
 */
public enum DictUpdateMode {

    OFFLINE_FULL("OFFLINE_FULL"),
    OFFLINE_DOMAIN("OFFLINE_DOMAIN"),
    REALTIME_ADD("REALTIME_ADD"),
    REALTIME_DELETE("REALTIME_DELETE"),
    NOT_SUPPORT("NOT_SUPPORT");

    // final: enum constants are immutable.
    private final String value;

    DictUpdateMode(String value) {
        this.value = value;
    }

    /**
     * Resolves a mode from its textual value, case-insensitively.
     *
     * @param value mode name; may be null
     * @return the matching mode, or {@link #NOT_SUPPORT} when nothing matches
     *         (including null input)
     */
    public static DictUpdateMode of(String value) {
        for (DictUpdateMode item : DictUpdateMode.values()) {
            if (item.value.equalsIgnoreCase(value)) {
                return item;
            }
        }
        return DictUpdateMode.NOT_SUPPORT;
    }

    public String getValue() {
        return value;
    }

}
|
||||
@@ -0,0 +1,17 @@
|
||||
package com.tencent.supersonic.knowledge.domain.pojo;
|
||||
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import lombok.Data;
|
||||
|
||||
/**
 * Command describing one dimension-value-to-dictionary update request.
 */
@Data
public class DimValue2DictCommand {

    // How the dictionary should be updated (offline full/per-domain, realtime add/delete).
    private DictUpdateMode updateMode;

    // Domains to process; interpretation depends on updateMode.
    private List<Long> domainIds;

    // domainId -> dimension ids to process; defaults to empty.
    private Map<Long, List<Long>> domainAndDimPair = new HashMap<>();
}
|
||||
@@ -0,0 +1,26 @@
|
||||
package com.tencent.supersonic.knowledge.domain.pojo;
|
||||
|
||||
|
||||
import com.tencent.supersonic.common.enums.TaskStatusEnum;
|
||||
import java.util.Date;
|
||||
import lombok.Data;
|
||||
|
||||
/**
 * Read model of one dimension-value dictionary task, as returned to callers
 * (status is the decoded enum rather than the raw code).
 */
@Data
public class DimValueDictInfo {

    // Task id.
    private Long id;

    // Task name.
    private String name;

    private String description;

    // JSON-serialized command that created the task.
    private String command;

    private TaskStatusEnum status;

    private String createdBy;

    private Date createdAt;

    private Long elapsedMs;
}
|
||||
@@ -0,0 +1,26 @@
|
||||
package com.tencent.supersonic.knowledge.domain.pojo;
|
||||
|
||||
|
||||
import com.tencent.supersonic.common.enums.TypeEnums;
|
||||
import java.util.List;
|
||||
import javax.validation.constraints.NotNull;
|
||||
|
||||
public class DimValueInfo {
|
||||
|
||||
/**
|
||||
* metricId、DimensionId、domainId
|
||||
*/
|
||||
private Long itemId;
|
||||
|
||||
/**
|
||||
* type: IntentionTypeEnum
|
||||
* temporarily only supports dimension-related information
|
||||
*/
|
||||
@NotNull
|
||||
private TypeEnums type = TypeEnums.DIMENSION;
|
||||
|
||||
private List<String> blackList;
|
||||
private List<String> whiteList;
|
||||
private List<String> ruleList;
|
||||
private Boolean isDictInfo;
|
||||
}
|
||||
@@ -0,0 +1,25 @@
|
||||
package com.tencent.supersonic.knowledge.domain.repository;
|
||||
|
||||
import com.tencent.supersonic.knowledge.domain.dataobject.DictConfPO;
|
||||
import com.tencent.supersonic.knowledge.domain.dataobject.DimValueDictTaskPO;
|
||||
import com.tencent.supersonic.knowledge.domain.pojo.DictConfig;
|
||||
import com.tencent.supersonic.knowledge.domain.pojo.DictTaskFilter;
|
||||
import com.tencent.supersonic.knowledge.domain.pojo.DimValueDictInfo;
|
||||
import java.util.List;
|
||||
|
||||
/**
 * Persistence operations for dictionary tasks and per-domain dictionary
 * configuration.
 */
public interface DictRepository {

    /**
     * @return id of the newly created dim-value dictionary task
     */
    Long createDimValueDictTask(DimValueDictTaskPO dimValueDictTaskPO);

    /**
     * Updates the status (and related columns) of an existing task.
     */
    Boolean updateDictTaskStatus(Integer status, DimValueDictTaskPO dimValueDictTaskPO);

    /**
     * @param filter null fields mean "no constraint"
     * @return tasks matching the filter
     */
    List<DimValueDictInfo> searchDictTaskList(DictTaskFilter filter);

    Boolean createDictConf(DictConfPO dictConfPO);

    Boolean editDictConf(DictConfPO dictConfPO);

    /**
     * Creates or updates the dictionary configuration for the PO's domain.
     */
    Boolean upsertDictInfo(DictConfPO dictConfPO);

    /**
     * @return dictionary configuration of the given domain
     */
    DictConfig getDictInfoByDomainId(Long domainId);
}
|
||||
@@ -0,0 +1,17 @@
|
||||
package com.tencent.supersonic.knowledge.domain.service;
|
||||
|
||||
import com.tencent.supersonic.common.nlp.WordNature;
|
||||
import java.util.List;
|
||||
|
||||
/**
 * online knowledge service interface
 */
public interface OnlineKnowledgeService {

    /**
     * Publishes word natures into the in-memory knowledge (custom dictionary
     * for prefixes, suffix suggester for suffixes).
     */
    void updateSemanticKnowledge(List<WordNature> natures);

    /**
     * Reloads the on-disk dictionary and then re-applies the given natures.
     */
    void reloadAllData(List<WordNature> natures);

    /**
     * Exception-safe wrapper around {@link #updateSemanticKnowledge(List)}.
     */
    void updateOnlineKnowledge(List<WordNature> natures);

}
|
||||
@@ -0,0 +1,17 @@
|
||||
package com.tencent.supersonic.knowledge.infrastructure.custom;
|
||||
|
||||
|
||||
import com.tencent.supersonic.knowledge.domain.dataobject.DictConfPO;
|
||||
import org.apache.ibatis.annotations.Mapper;
|
||||
|
||||
/**
 * MyBatis mapper for the per-domain dictionary-configuration table.
 */
@Mapper
public interface DictConfMapper {

    Boolean createDictConf(DictConfPO dictConfPO);

    Boolean editDictConf(DictConfPO dictConfPO);

    // Insert-or-update by domain id.
    Boolean upsertDictInfo(DictConfPO dictConfPO);

    DictConfPO getDictInfoByDomainId(Long domainId);
}
|
||||
@@ -0,0 +1,16 @@
|
||||
package com.tencent.supersonic.knowledge.infrastructure.custom;
|
||||
|
||||
import com.tencent.supersonic.knowledge.domain.dataobject.DimValueDictTaskPO;
|
||||
import com.tencent.supersonic.knowledge.domain.pojo.DictTaskFilter;
|
||||
import java.util.List;
|
||||
import org.apache.ibatis.annotations.Mapper;
|
||||
|
||||
/**
 * MyBatis mapper for dimension-value dictionary tasks.
 */
@Mapper
public interface DictTaskMapper {

    // Returns the generated task id.
    Long createDimValueTask(DimValueDictTaskPO dimValueDictTaskPO);

    Boolean updateTaskStatus(DimValueDictTaskPO dimValueDictTaskPO);

    List<DimValueDictTaskPO> searchDictTaskList(DictTaskFilter filter);
}
|
||||
@@ -0,0 +1,35 @@
|
||||
package com.tencent.supersonic.knowledge.infrastructure.nlp;
|
||||
|
||||
import com.hankcs.hanlp.corpus.tag.Nature;
|
||||
import com.hankcs.hanlp.dictionary.CoreDictionary;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.IntStream;
|
||||
|
||||
/**
|
||||
* Dictionary Attribute Util
|
||||
*/
|
||||
public class DictionaryAttributeUtil {
|
||||
|
||||
public static CoreDictionary.Attribute getAttribute(CoreDictionary.Attribute old, CoreDictionary.Attribute add) {
|
||||
Map<Nature, Integer> map = new HashMap<>();
|
||||
IntStream.range(0, old.nature.length).boxed().forEach(i -> map.put(old.nature[i], old.frequency[i]));
|
||||
IntStream.range(0, add.nature.length).boxed().forEach(i -> map.put(add.nature[i], add.frequency[i]));
|
||||
List<Map.Entry<Nature, Integer>> list = new LinkedList<Map.Entry<Nature, Integer>>(map.entrySet());
|
||||
Collections.sort(list, new Comparator<Map.Entry<Nature, Integer>>() {
|
||||
public int compare(Map.Entry<Nature, Integer> o1, Map.Entry<Nature, Integer> o2) {
|
||||
return o2.getValue() - o1.getValue();
|
||||
}
|
||||
});
|
||||
CoreDictionary.Attribute attribute = new CoreDictionary.Attribute(
|
||||
list.stream().map(i -> i.getKey()).collect(Collectors.toList()).toArray(new Nature[0]),
|
||||
list.stream().map(i -> i.getValue()).mapToInt(Integer::intValue).toArray(),
|
||||
list.stream().map(i -> i.getValue()).findFirst().get());
|
||||
return attribute;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,81 @@
|
||||
package com.tencent.supersonic.knowledge.infrastructure.nlp;
|
||||
|
||||
import static com.hankcs.hanlp.HanLP.Config.CustomDictionaryPath;
|
||||
import static com.tencent.supersonic.knowledge.infrastructure.nlp.HanlpHelper.FILE_SPILT;
|
||||
|
||||
import com.hankcs.hanlp.dictionary.DynamicCustomDictionary;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
public class FileHelper {
|
||||
|
||||
private static final Logger LOGGER = LoggerFactory.getLogger(FileHelper.class);
|
||||
|
||||
public static void deleteCacheFile(String[] path) throws IOException {
|
||||
|
||||
String customPath = getCustomPath(path);
|
||||
File customFolder = new File(customPath);
|
||||
|
||||
File[] customSubFiles = getFileList(customFolder, ".bin");
|
||||
|
||||
for (File file : customSubFiles) {
|
||||
try {
|
||||
file.delete();
|
||||
LOGGER.info("customPath:{},delete cache file:{}", customPath, file);
|
||||
} catch (Exception e) {
|
||||
LOGGER.error("delete " + file, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static File[] getFileList(File customFolder, String suffix) {
|
||||
File[] customSubFiles = customFolder.listFiles(file -> {
|
||||
if (file.isDirectory()) {
|
||||
return false;
|
||||
}
|
||||
if (file.getName().toLowerCase().endsWith(suffix)) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
});
|
||||
return customSubFiles;
|
||||
}
|
||||
|
||||
private static String getCustomPath(String[] path) {
|
||||
return path[0].substring(0, path[0].lastIndexOf(FILE_SPILT)) + FILE_SPILT;
|
||||
}
|
||||
|
||||
/**
|
||||
* reset path
|
||||
*
|
||||
* @param customDictionary
|
||||
*/
|
||||
public static void resetCustomPath(DynamicCustomDictionary customDictionary) {
|
||||
String[] path = CustomDictionaryPath;
|
||||
|
||||
String customPath = getCustomPath(path);
|
||||
File customFolder = new File(customPath);
|
||||
|
||||
File[] customSubFiles = getFileList(customFolder, ".txt");
|
||||
|
||||
List<String> fileList = new ArrayList<>();
|
||||
|
||||
for (File file : customSubFiles) {
|
||||
if (file.isFile()) {
|
||||
fileList.add(file.getAbsolutePath());
|
||||
}
|
||||
}
|
||||
|
||||
LOGGER.info("CustomDictionaryPath:{}", fileList);
|
||||
CustomDictionaryPath = fileList.toArray(new String[0]);
|
||||
customDictionary.path = (CustomDictionaryPath == null || CustomDictionaryPath.length == 0) ? path
|
||||
: CustomDictionaryPath;
|
||||
if (CustomDictionaryPath == null || CustomDictionaryPath.length == 0) {
|
||||
CustomDictionaryPath = path;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,33 @@
|
||||
package com.tencent.supersonic.knowledge.infrastructure.nlp;
|
||||
|
||||
import com.hankcs.hanlp.corpus.io.IIOAdapter;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.net.URI;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
/**
 * HanLP IO adapter that resolves dictionary resources on HDFS instead of the
 * local file system.
 */
public class HadoopFileIOAdapter implements IIOAdapter {

    private static final Logger LOGGER = LoggerFactory.getLogger(HadoopFileIOAdapter.class);

    /**
     * Opens an HDFS path for reading.
     *
     * NOTE(review): the FileSystem instance is not closed here — presumably
     * relying on Hadoop's internal FileSystem cache; confirm before changing.
     */
    @Override
    public InputStream open(String path) throws IOException {
        LOGGER.info("open:{}", path);
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(path), conf);
        return fs.open(new Path(path));
    }

    /**
     * Creates (or overwrites) an HDFS path for writing.
     */
    @Override
    public OutputStream create(String path) throws IOException {
        LOGGER.info("create:{}", path);
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(path), conf);
        return fs.create(new Path(path));
    }
}
|
||||
@@ -0,0 +1,157 @@
|
||||
package com.tencent.supersonic.knowledge.infrastructure.nlp;
|
||||
|
||||
import static com.hankcs.hanlp.HanLP.Config.CustomDictionaryPath;
|
||||
|
||||
import com.hankcs.hanlp.HanLP;
|
||||
import com.hankcs.hanlp.dictionary.DynamicCustomDictionary;
|
||||
import com.hankcs.hanlp.seg.Segment;
|
||||
import com.tencent.supersonic.common.nlp.WordNature;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.util.ResourceUtils;
|
||||
|
||||
/**
|
||||
* HanLP helper
|
||||
*/
|
||||
public class HanlpHelper {
|
||||
|
||||
private static final Logger LOGGER = LoggerFactory.getLogger(HanlpHelper.class);
|
||||
public static final String FILE_SPILT = "/";
|
||||
public static final String SPACE_SPILT = "#";
|
||||
private static volatile Segment segment;
|
||||
public static volatile DynamicCustomDictionary CustomDictionary;
|
||||
public static final String DICT_MAIN_FILE_NAME = "CustomDictionary.txt";
|
||||
public static final String DICT_CLASS = "classes";
|
||||
public static final String NER = "crf";
|
||||
|
||||
static {
|
||||
// reset hanlp config
|
||||
try {
|
||||
resetHanlpConfig();
|
||||
} catch (FileNotFoundException e) {
|
||||
LOGGER.error("resetHanlpConfig error", e);
|
||||
}
|
||||
}
|
||||
|
||||
public static Segment getSegment() {
|
||||
if (segment == null) {
|
||||
synchronized (HanlpHelper.class) {
|
||||
if (segment == null) {
|
||||
segment = HanLP.newSegment(NER)
|
||||
.enableIndexMode(true).enableIndexMode(4)
|
||||
.enableCustomDictionary(true).enableCustomDictionaryForcing(true).enableOffset(true)
|
||||
.enableJapaneseNameRecognize(false).enableNameRecognize(false)
|
||||
.enableAllNamedEntityRecognize(false)
|
||||
.enableJapaneseNameRecognize(false).enableNumberQuantifierRecognize(false)
|
||||
.enablePlaceRecognize(false)
|
||||
.enableOrganizationRecognize(false).enableCustomDictionary(getDynamicCustomDictionary());
|
||||
}
|
||||
}
|
||||
}
|
||||
return segment;
|
||||
}
|
||||
|
||||
public static DynamicCustomDictionary getDynamicCustomDictionary() {
|
||||
if (CustomDictionary == null) {
|
||||
synchronized (HanlpHelper.class) {
|
||||
if (CustomDictionary == null) {
|
||||
CustomDictionary = new MultiCustomDictionary(CustomDictionaryPath);
|
||||
}
|
||||
}
|
||||
}
|
||||
return CustomDictionary;
|
||||
}
|
||||
|
||||
/***
|
||||
* reload custom dictionary
|
||||
*/
|
||||
public static boolean reloadCustomDictionary() throws IOException {
|
||||
|
||||
LOGGER.info("reloadCustomDictionary start");
|
||||
|
||||
final long startTime = System.currentTimeMillis();
|
||||
|
||||
if (CustomDictionaryPath == null || CustomDictionaryPath.length == 0) {
|
||||
return false;
|
||||
}
|
||||
if (HanLP.Config.IOAdapter instanceof HadoopFileIOAdapter) {
|
||||
// 1.delete hdfs file
|
||||
HdfsFileHelper.deleteCacheFile(CustomDictionaryPath);
|
||||
// 2.query txt files,update CustomDictionaryPath
|
||||
HdfsFileHelper.resetCustomPath(getDynamicCustomDictionary());
|
||||
} else {
|
||||
FileHelper.deleteCacheFile(CustomDictionaryPath);
|
||||
FileHelper.resetCustomPath(getDynamicCustomDictionary());
|
||||
}
|
||||
// 3.clear trie
|
||||
Suggester.clear();
|
||||
|
||||
boolean reload = getDynamicCustomDictionary().reload();
|
||||
LOGGER.info("reloadCustomDictionary end ,cost:{},reload:{}", System.currentTimeMillis() - startTime, reload);
|
||||
return reload;
|
||||
}
|
||||
|
||||
private static void resetHanlpConfig() throws FileNotFoundException {
|
||||
if (HanLP.Config.IOAdapter instanceof HadoopFileIOAdapter) {
|
||||
return;
|
||||
}
|
||||
String hanlpPropertiesPath = getHanlpPropertiesPath();
|
||||
|
||||
CustomDictionaryPath = Arrays.stream(CustomDictionaryPath).map(path -> hanlpPropertiesPath + FILE_SPILT + path)
|
||||
.toArray(String[]::new);
|
||||
LOGGER.info("hanlpPropertiesPath:{},CustomDictionaryPath:{}", hanlpPropertiesPath, CustomDictionaryPath);
|
||||
|
||||
HanLP.Config.CoreDictionaryPath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.BiGramDictionaryPath;
|
||||
HanLP.Config.CoreDictionaryTransformMatrixDictionaryPath = hanlpPropertiesPath + FILE_SPILT
|
||||
+ HanLP.Config.CoreDictionaryTransformMatrixDictionaryPath;
|
||||
HanLP.Config.BiGramDictionaryPath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.BiGramDictionaryPath;
|
||||
HanLP.Config.CoreStopWordDictionaryPath =
|
||||
hanlpPropertiesPath + FILE_SPILT + HanLP.Config.CoreStopWordDictionaryPath;
|
||||
HanLP.Config.CoreSynonymDictionaryDictionaryPath = hanlpPropertiesPath + FILE_SPILT
|
||||
+ HanLP.Config.CoreSynonymDictionaryDictionaryPath;
|
||||
HanLP.Config.PersonDictionaryPath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.PersonDictionaryPath;
|
||||
HanLP.Config.PersonDictionaryTrPath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.PersonDictionaryTrPath;
|
||||
|
||||
HanLP.Config.PinyinDictionaryPath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.PinyinDictionaryPath;
|
||||
HanLP.Config.TranslatedPersonDictionaryPath = hanlpPropertiesPath + FILE_SPILT
|
||||
+ HanLP.Config.TranslatedPersonDictionaryPath;
|
||||
HanLP.Config.JapanesePersonDictionaryPath = hanlpPropertiesPath + FILE_SPILT
|
||||
+ HanLP.Config.JapanesePersonDictionaryPath;
|
||||
HanLP.Config.PlaceDictionaryPath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.PlaceDictionaryPath;
|
||||
HanLP.Config.PlaceDictionaryTrPath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.PlaceDictionaryTrPath;
|
||||
HanLP.Config.OrganizationDictionaryPath = hanlpPropertiesPath + FILE_SPILT
|
||||
+ HanLP.Config.OrganizationDictionaryPath;
|
||||
HanLP.Config.OrganizationDictionaryTrPath = hanlpPropertiesPath + FILE_SPILT
|
||||
+ HanLP.Config.OrganizationDictionaryTrPath;
|
||||
HanLP.Config.CharTypePath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.CharTypePath;
|
||||
HanLP.Config.CharTablePath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.CharTablePath;
|
||||
HanLP.Config.PartOfSpeechTagDictionary =
|
||||
hanlpPropertiesPath + FILE_SPILT + HanLP.Config.PartOfSpeechTagDictionary;
|
||||
HanLP.Config.WordNatureModelPath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.WordNatureModelPath;
|
||||
HanLP.Config.MaxEntModelPath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.MaxEntModelPath;
|
||||
HanLP.Config.NNParserModelPath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.NNParserModelPath;
|
||||
HanLP.Config.PerceptronParserModelPath =
|
||||
hanlpPropertiesPath + FILE_SPILT + HanLP.Config.PerceptronParserModelPath;
|
||||
HanLP.Config.CRFSegmentModelPath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.CRFSegmentModelPath;
|
||||
HanLP.Config.HMMSegmentModelPath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.HMMSegmentModelPath;
|
||||
HanLP.Config.CRFCWSModelPath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.CRFCWSModelPath;
|
||||
HanLP.Config.CRFPOSModelPath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.CRFPOSModelPath;
|
||||
HanLP.Config.CRFNERModelPath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.CRFNERModelPath;
|
||||
HanLP.Config.PerceptronCWSModelPath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.PerceptronCWSModelPath;
|
||||
HanLP.Config.PerceptronPOSModelPath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.PerceptronPOSModelPath;
|
||||
HanLP.Config.PerceptronNERModelPath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.PerceptronNERModelPath;
|
||||
}
|
||||
|
||||
    /**
     * Resolves the directory that contains the {@code hanlp.properties} file found on the classpath.
     * The returned directory is used as the base for rewriting HanLP's relative dictionary paths.
     *
     * @return absolute parent-directory path of {@code hanlp.properties}
     * @throws FileNotFoundException if {@code hanlp.properties} is not present on the classpath
     */
    public static String getHanlpPropertiesPath() throws FileNotFoundException {
        return ResourceUtils.getFile("classpath:hanlp.properties").getParent();
    }
|
||||
|
||||
    /**
     * Inserts one word (with its nature-with-frequency string) into the dynamic custom dictionary.
     *
     * @param wordNature word plus its serialized nature/frequency description
     * @return true if the dictionary accepted the entry
     */
    public static boolean addToCustomDictionary(WordNature wordNature) {
        LOGGER.debug("wordNature:{}", wordNature);
        return getDynamicCustomDictionary().insert(wordNature.getWord(), wordNature.getNatureWithFrequency());
    }
|
||||
|
||||
}
|
||||
@@ -0,0 +1,85 @@
|
||||
package com.tencent.supersonic.knowledge.infrastructure.nlp;
|
||||
|
||||
import static com.hankcs.hanlp.HanLP.Config.CustomDictionaryPath;
|
||||
import static com.tencent.supersonic.knowledge.infrastructure.nlp.HanlpHelper.FILE_SPILT;
|
||||
|
||||
import com.hankcs.hanlp.dictionary.DynamicCustomDictionary;
|
||||
import com.hankcs.hanlp.utility.Predefine;
|
||||
import java.io.IOException;
|
||||
import java.net.URI;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
/**
|
||||
* Hdfs File Helper
|
||||
*/
|
||||
public class HdfsFileHelper {
|
||||
|
||||
private static final Logger LOGGER = LoggerFactory.getLogger(HdfsFileHelper.class);
|
||||
|
||||
/***
|
||||
* delete cache file
|
||||
* @param path
|
||||
* @throws IOException
|
||||
*/
|
||||
public static void deleteCacheFile(String[] path) throws IOException {
|
||||
FileSystem fs = FileSystem.get(URI.create(path[0]), new Configuration());
|
||||
String cacheFilePath = path[0] + Predefine.BIN_EXT;
|
||||
LOGGER.info("delete cache file:{}", cacheFilePath);
|
||||
try {
|
||||
fs.delete(new Path(cacheFilePath), false);
|
||||
} catch (Exception e) {
|
||||
LOGGER.error("delete:" + cacheFilePath, e);
|
||||
}
|
||||
int customBase = cacheFilePath.lastIndexOf(FILE_SPILT);
|
||||
String customPath = cacheFilePath.substring(0, customBase) + FILE_SPILT + "*.bin";
|
||||
List<String> fileList = getFileList(fs, new Path(customPath));
|
||||
for (String file : fileList) {
|
||||
try {
|
||||
fs.delete(new Path(file), false);
|
||||
LOGGER.info("delete cache file:{}", file);
|
||||
} catch (Exception e) {
|
||||
LOGGER.error("delete " + file, e);
|
||||
}
|
||||
}
|
||||
LOGGER.info("fileList:{}", fileList);
|
||||
}
|
||||
|
||||
/**
|
||||
* reset path
|
||||
* @param customDictionary
|
||||
* @throws IOException
|
||||
*/
|
||||
public static void resetCustomPath(DynamicCustomDictionary customDictionary) throws IOException {
|
||||
String[] path = CustomDictionaryPath;
|
||||
FileSystem fs = FileSystem.get(URI.create(path[0]), new Configuration());
|
||||
String cacheFilePath = path[0] + Predefine.BIN_EXT;
|
||||
int customBase = cacheFilePath.lastIndexOf(FILE_SPILT);
|
||||
String customPath = cacheFilePath.substring(0, customBase) + FILE_SPILT + "*.txt";
|
||||
LOGGER.info("customPath:{}", customPath);
|
||||
List<String> fileList = getFileList(fs, new Path(customPath));
|
||||
LOGGER.info("CustomDictionaryPath:{}", fileList);
|
||||
CustomDictionaryPath = fileList.toArray(new String[0]);
|
||||
customDictionary.path = (CustomDictionaryPath == null || CustomDictionaryPath.length == 0) ? path
|
||||
: CustomDictionaryPath;
|
||||
if (CustomDictionaryPath == null || CustomDictionaryPath.length == 0) {
|
||||
CustomDictionaryPath = path;
|
||||
}
|
||||
}
|
||||
|
||||
public static List<String> getFileList(FileSystem fs, Path folderPath) throws IOException {
|
||||
List<String> paths = new ArrayList();
|
||||
FileStatus[] fileStatuses = fs.globStatus(folderPath);
|
||||
for (int i = 0; i < fileStatuses.length; i++) {
|
||||
FileStatus fileStatus = fileStatuses[i];
|
||||
paths.add(fileStatus.getPath().toString());
|
||||
}
|
||||
return paths;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,368 @@
|
||||
package com.tencent.supersonic.knowledge.infrastructure.nlp;
|
||||
|
||||
import static com.hankcs.hanlp.utility.Predefine.logger;
|
||||
|
||||
import com.hankcs.hanlp.HanLP;
|
||||
import com.hankcs.hanlp.collection.trie.DoubleArrayTrie;
|
||||
import com.hankcs.hanlp.collection.trie.bintrie.BinTrie;
|
||||
import com.hankcs.hanlp.corpus.io.ByteArray;
|
||||
import com.hankcs.hanlp.corpus.io.IOUtil;
|
||||
import com.hankcs.hanlp.corpus.tag.Nature;
|
||||
import com.hankcs.hanlp.dictionary.CoreDictionary;
|
||||
import com.hankcs.hanlp.dictionary.DynamicCustomDictionary;
|
||||
import com.hankcs.hanlp.dictionary.other.CharTable;
|
||||
import com.hankcs.hanlp.utility.LexiconUtility;
|
||||
import com.hankcs.hanlp.utility.Predefine;
|
||||
import com.hankcs.hanlp.utility.TextUtility;
|
||||
|
||||
import java.io.BufferedOutputStream;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.DataOutputStream;
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.util.Arrays;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.TreeMap;
|
||||
|
||||
/**
 * Custom dictionary supporting multiple dictionary files, case-insensitive letter words,
 * and duplicate merging. Extends HanLP's {@code DynamicCustomDictionary}; loaded entries
 * are mirrored into the {@link Suggester} prefix trie when enabled.
 */
public class MultiCustomDictionary extends DynamicCustomDictionary {

    // when true, entries with the same (lower-cased) word are merged via DictionaryAttributeUtil
    public static Boolean removeDuplicates = true;
    // when true, every loaded/inserted word is also pushed into the Suggester trie
    private static boolean addToSuggesterTrie = true;

    public MultiCustomDictionary() {
        this(HanLP.Config.CustomDictionaryPath);
    }

    public MultiCustomDictionary(String... path) {
        super(path);
    }

    /**
     * Loads the dictionaries at {@code path}; path[0] is the main dictionary used for the
     * ".bin" cache name.
     *
     * @return true on success, false when the main dictionary fails to load
     */
    public boolean load(String... path) {
        this.path = path;
        long start = System.currentTimeMillis();
        if (!this.loadMainDictionary(path[0])) {
            Predefine.logger.warning("自定义词典" + Arrays.toString(path) + "加载失败");
            return false;
        } else {
            Predefine.logger.info(
                    "自定义词典加载成功:" + this.dat.size() + "个词条,耗时" + (System.currentTimeMillis() - start) + "ms");
            this.path = path;
            return true;
        }
    }

    /***
     * Loads one dictionary file into {@code map}. Lines are "word [nature freq]*" separated by
     * whitespace (or commas for .csv). Words containing uppercase letters are stored lower-cased
     * with the original kept in {@code attribute.original}.
     *
     * @param path dictionary file path
     * @param defaultNature nature assigned when a line carries no nature/frequency pairs
     * @param map accumulator of word -> attribute
     * @param customNatureCollector collects natures not predefined by HanLP
     * @param addToSuggeterTrie whether to mirror entries into the Suggester trie
     * @return true on success, false on any read/parse error
     */
    public static boolean load(String path, Nature defaultNature, TreeMap<String, CoreDictionary.Attribute> map,
            LinkedHashSet<Nature> customNatureCollector, boolean addToSuggeterTrie) {
        try {
            String splitter = "\\s";
            if (path.endsWith(".csv")) {
                splitter = ",";
            }

            BufferedReader br = new BufferedReader(new InputStreamReader(IOUtil.newInputStream(path), "UTF-8"));
            boolean firstLine = true;

            while (true) {
                String[] param;
                do {
                    String line;
                    // EOF: close and report success
                    if ((line = br.readLine()) == null) {
                        br.close();
                        return true;
                    }

                    // strip a UTF-8 byte-order mark from the very first line only
                    if (firstLine) {
                        line = IOUtil.removeUTF8BOM(line);
                        firstLine = false;
                    }

                    param = line.split(splitter);
                } while (param[0].length() == 0);

                if (HanLP.Config.Normalization) {
                    param[0] = CharTable.convert(param[0]);
                }

                // each nature takes two columns: nature name + frequency
                int natureCount = (param.length - 1) / 2;
                CoreDictionary.Attribute attribute;
                boolean isLetters = isLetters(param[0]);
                String original = null;
                String word = getWordBySpace(param[0]);
                if (isLetters) {
                    // keep the original casing, index by lower case
                    original = word;
                    word = word.toLowerCase();
                }
                if (natureCount == 0) {
                    attribute = new CoreDictionary.Attribute(defaultNature);
                } else {
                    attribute = new CoreDictionary.Attribute(natureCount);

                    for (int i = 0; i < natureCount; ++i) {
                        attribute.nature[i] = LexiconUtility.convertStringToNature(param[1 + 2 * i],
                                customNatureCollector);
                        attribute.frequency[i] = Integer.parseInt(param[2 + 2 * i]);
                        attribute.totalFrequency += attribute.frequency[i];
                    }
                }
                attribute.original = original;
                // merge with an existing entry when duplicate removal is enabled
                if (removeDuplicates && map.containsKey(word)) {
                    attribute = DictionaryAttributeUtil.getAttribute(map.get(word), attribute);
                    map.put(word, attribute);
                    if (addToSuggeterTrie) {
                        Suggester.put(word, attribute);
                    }

                } else {
                    map.put(word, attribute);
                    if (addToSuggeterTrie) {
                        Suggester.put(word, attribute);
                    }
                }
            }
        } catch (Exception var12) {
            logger.severe("自定义词典" + path + "读取错误!" + var12);
            return false;
        }
    }

    public boolean loadMainDictionary(String mainPath) {
        return loadMainDictionary(mainPath, this.path, this.dat, true, addToSuggesterTrie);
    }

    /***
     * Loads the main dictionary: first tries the ".bin" cache; otherwise parses every text
     * dictionary in {@code path}, builds the double-array trie, and (optionally) rewrites the
     * cache.
     *
     * @param mainPath main dictionary path (cache file is mainPath + ".bin")
     * @param path all dictionary paths; a trailing " nature" in a file name sets its default nature
     * @param dat trie to populate
     * @param isCache whether to write the ".bin" cache after building
     * @param addToSuggestTrie whether to mirror entries into the Suggester trie
     * @return true unless loading fails outright (cache write errors are only warned)
     */
    public static boolean loadMainDictionary(String mainPath, String[] path,
            DoubleArrayTrie<CoreDictionary.Attribute> dat, boolean isCache, boolean addToSuggestTrie) {
        Predefine.logger.info("自定义词典开始加载:" + mainPath);
        if (loadDat(mainPath, dat)) {
            return true;
        } else {
            TreeMap<String, CoreDictionary.Attribute> map = new TreeMap();
            LinkedHashSet customNatureCollector = new LinkedHashSet();

            try {
                for (String p : path) {
                    Nature defaultNature = Nature.n;
                    File file = new File(p);
                    String fileName = file.getName();
                    // 32 == ' ': "dict.txt nature" means load dict.txt with that default nature
                    int cut = fileName.lastIndexOf(32);
                    if (cut > 0) {
                        String nature = fileName.substring(cut + 1);
                        p = file.getParent() + File.separator + fileName.substring(0, cut);

                        try {
                            defaultNature = LexiconUtility.convertStringToNature(nature, customNatureCollector);
                        } catch (Exception var16) {
                            Predefine.logger.severe("配置文件【" + p + "】写错了!" + var16);
                            continue;
                        }
                    }

                    Predefine.logger.info("以默认词性[" + defaultNature + "]加载自定义词典" + p + "中……");
                    boolean success = load(p, defaultNature, map, customNatureCollector, addToSuggestTrie);
                    if (!success) {
                        Predefine.logger.warning("失败:" + p);
                    }
                }

                // a DoubleArrayTrie cannot be built empty; insert a placeholder entry
                if (map.size() == 0) {
                    Predefine.logger.warning("没有加载到任何词条");
                    map.put("未##它", null);
                }

                logger.info("正在构建DoubleArrayTrie……");
                dat.build(map);
                if (addToSuggestTrie) {
                    // Suggester.save();
                }
                if (isCache) {
                    // cache as a .bin file so the next load is much faster
                    logger.info("正在缓存词典为dat文件……");
                    // value cache file: attributes in map iteration order, then the trie itself
                    List<CoreDictionary.Attribute> attributeList = new LinkedList<CoreDictionary.Attribute>();
                    for (Map.Entry<String, CoreDictionary.Attribute> entry : map.entrySet()) {
                        attributeList.add(entry.getValue());
                    }

                    DataOutputStream out = new DataOutputStream(
                            new BufferedOutputStream(IOUtil.newOutputStream(mainPath + ".bin")));
                    // no custom natures collected: persist the full predefined nature table instead
                    if (customNatureCollector.isEmpty()) {
                        for (int i = Nature.begin.ordinal() + 1; i < Nature.values().length; ++i) {
                            customNatureCollector.add(Nature.values()[i]);
                        }
                    }

                    IOUtil.writeCustomNature(out, customNatureCollector);
                    out.writeInt(attributeList.size());

                    for (CoreDictionary.Attribute attribute : attributeList) {
                        attribute.save(out);
                    }

                    dat.save(out);
                    out.close();
                }
            } catch (FileNotFoundException var17) {
                logger.severe("自定义词典" + mainPath + "不存在!" + var17);
                return false;
            } catch (IOException var18) {
                logger.severe("自定义词典" + mainPath + "读取错误!" + var18);
                return false;
            } catch (Exception var19) {
                // cache write failure is non-fatal: the in-memory trie is already built
                logger.warning("自定义词典" + mainPath + "缓存失败!\n" + TextUtility.exceptionToString(var19));
            }

            return true;
        }
    }

    public static boolean loadDat(String path, DoubleArrayTrie<CoreDictionary.Attribute> dat) {
        return loadDat(path, HanLP.Config.CustomDictionaryPath, dat);
    }

    /**
     * Loads the ".bin" cache written by {@link #loadMainDictionary}. Returns false when the
     * cache is missing, stale (auto-refresh enabled and sources newer), or unreadable.
     */
    public static boolean loadDat(String path, String[] customDicPath, DoubleArrayTrie<CoreDictionary.Attribute> dat) {
        try {
            if (HanLP.Config.CustomDictionaryAutoRefreshCache && isDicNeedUpdate(path, customDicPath)) {
                return false;
            } else {
                ByteArray byteArray = ByteArray.createByteArray(path + ".bin");
                if (byteArray == null) {
                    return false;
                } else {
                    int size = byteArray.nextInt();
                    // negative first int encodes a preamble of |size| custom nature names,
                    // followed by the real attribute count
                    if (size < 0) {
                        while (true) {
                            ++size;
                            if (size > 0) {
                                size = byteArray.nextInt();
                                break;
                            }

                            Nature.create(byteArray.nextString());
                        }
                    }

                    CoreDictionary.Attribute[] attributes = new CoreDictionary.Attribute[size];
                    Nature[] natureIndexArray = Nature.values();

                    for (int i = 0; i < size; ++i) {
                        // per attribute: totalFrequency, nature count, then (natureIndex, frequency) pairs
                        int currentTotalFrequency = byteArray.nextInt();
                        int length = byteArray.nextInt();
                        attributes[i] = new CoreDictionary.Attribute(length);
                        attributes[i].totalFrequency = currentTotalFrequency;

                        for (int j = 0; j < length; ++j) {
                            attributes[i].nature[j] = natureIndexArray[byteArray.nextInt()];
                            attributes[i].frequency[j] = byteArray.nextInt();
                        }
                    }

                    if (!dat.load(byteArray, attributes)) {
                        return false;
                    } else {
                        return true;
                    }
                }
            }
        } catch (Exception var11) {
            logger.warning("读取失败,问题发生在" + TextUtility.exceptionToString(var11));
            return false;
        }
    }


    /**
     * Deletes the cache and reloads from the text dictionaries.
     * NOTE(review): after deleting the cache, loadDat is expected to fail so the main
     * dictionary is rebuilt; returns false when path is empty — confirm callers treat
     * false-after-cache-hit as intended.
     */
    public boolean reload() {
        if (this.path != null && this.path.length != 0) {
            IOUtil.deleteFile(this.path[0] + ".bin");
            Boolean loadCacheOk = this.loadDat(this.path[0], this.path, this.dat);
            if (!loadCacheOk) {
                return this.loadMainDictionary(this.path[0], this.path, this.dat, true, addToSuggesterTrie);
            }
        }
        return false;

    }

    /**
     * Inserts one word at runtime, merging with an existing entry in either the
     * double-array trie or the dynamic bin-trie; mirrors into the Suggester when enabled.
     *
     * @param word word to insert; letter words are indexed lower-cased
     * @param natureWithFrequency serialized nature/frequency, or null for default (nz, 1)
     * @return true when inserted/merged, false for null word or unparsable attribute
     */
    public boolean insert(String word, String natureWithFrequency) {
        if (word == null) {
            return false;
        } else {
            if (HanLP.Config.Normalization) {
                word = CharTable.convert(word);
            }
            CoreDictionary.Attribute att = natureWithFrequency == null ? new CoreDictionary.Attribute(Nature.nz, 1)
                    : CoreDictionary.Attribute.create(natureWithFrequency);
            boolean isLetters = isLetters(word);
            word = getWordBySpace(word);
            String original = null;
            if (isLetters) {
                original = word;
                word = word.toLowerCase();
            }
            if (att == null) {
                return false;
            } else if (this.dat.containsKey(word)) {
                // word already in the static trie: merge attributes in place
                att.original = original;
                att = DictionaryAttributeUtil.getAttribute(this.dat.get(word), att);
                this.dat.set(word, att);
                // return true;
            } else {
                // otherwise stage it in the dynamic bin-trie
                if (this.trie == null) {
                    this.trie = new BinTrie();
                }
                att.original = original;
                if (this.trie.containsKey(word)) {
                    att = DictionaryAttributeUtil.getAttribute(this.trie.get(word), att);
                }
                this.trie.put(word, att);
                // return true;
            }
            if (addToSuggesterTrie) {
                Suggester.put(word, att);
            }
            return true;
        }
    }

    /**
     * Returns true when the string is longer than one character and contains at least one
     * ASCII uppercase letter (i.e. it needs lower-casing for case-insensitive lookup).
     */
    public static boolean isLetters(String str) {
        char[] chars = str.toCharArray();
        if (chars.length <= 1) {
            return false;
        }
        for (int i = 0; i < chars.length; i++) {
            if ((chars[i] >= 'A' && chars[i] <= 'Z')) {
                return true;
            }
        }
        return false;
    }

    /**
     * Replaces the placeholder {@code HanlpHelper.SPACE_SPILT} with a real space,
     * so multi-word phrases survive whitespace-splitting dictionary formats.
     */
    public static String getWordBySpace(String word) {
        if (word.contains(HanlpHelper.SPACE_SPILT)) {
            return word.replace(HanlpHelper.SPACE_SPILT, " ");
        }
        return word;
    }
}
|
||||
@@ -0,0 +1,146 @@
|
||||
package com.tencent.supersonic.knowledge.infrastructure.nlp;
|
||||
|
||||
import com.hankcs.hanlp.collection.trie.bintrie.BaseNode;
|
||||
import com.hankcs.hanlp.collection.trie.bintrie.BinTrie;
|
||||
import com.hankcs.hanlp.corpus.tag.Nature;
|
||||
import com.hankcs.hanlp.dictionary.CoreDictionary;
|
||||
import com.tencent.supersonic.common.nlp.MapResult;
|
||||
import com.tencent.supersonic.common.nlp.NatureType;
|
||||
import com.tencent.supersonic.common.nlp.WordNature;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.TreeMap;
|
||||
import java.util.TreeSet;
|
||||
import java.util.stream.Collectors;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.springframework.util.CollectionUtils;
|
||||
|
||||
@Service
public class Suggester {

    private static final Logger LOGGER = LoggerFactory.getLogger(Suggester.class);
    // prefix trie: dictionary word -> its nature names
    private static BinTrie<List<String>> trie;
    // suffix trie: stores REVERSED words so a suffix lookup becomes a prefix walk
    private static BinTrie<List<String>> suffixTrie;
    // NOTE(review): not read or written in this class — possibly dead; confirm before removing
    private static String localFileCache = "";

    public static final int SEARCH_SIZE = 200;

    static {
        trie = new BinTrie<>();
        suffixTrie = new BinTrie<>();
    }

    /***
     * Prefix search with the default result cap.
     * @param key prefix to look up
     * @return matches sorted by name length ascending, capped at SEARCH_SIZE
     */
    public static List<MapResult> prefixSearch(String key) {
        return prefixSearch(key, SEARCH_SIZE, trie);
    }

    public static List<MapResult> prefixSearch(String key, int limit) {
        return prefixSearch(key, limit, trie);
    }

    /**
     * Prefix search over the given trie; '#' placeholders in stored keys are mapped back
     * to spaces before results are returned.
     * NOTE(review): the stream is capped at SEARCH_SIZE rather than the given limit
     * (limit only bounds the trie walk) — confirm this is intended.
     */
    public static List<MapResult> prefixSearch(String key, int limit, BinTrie<List<String>> binTrie) {
        Set<Map.Entry<String, List<String>>> result = prefixSearchLimit(key, limit, binTrie);
        return result.stream().map(
                entry -> {
                    String name = entry.getKey().replace("#", " ");
                    return new MapResult(name, entry.getValue());
                }
        ).sorted((a, b) -> -(b.getName().length() - a.getName().length())) // ascending by name length
                .limit(SEARCH_SIZE)
                .collect(Collectors.toList());
    }

    /***
     * Suffix search: reverses the key and walks the suffix trie (which holds reversed words).
     * @param key suffix to look up
     * @return matches with names restored to forward order and suffix nature markers stripped
     */
    public static List<MapResult> suffixSearch(String key, int limit) {
        String reverseDetectSegment = StringUtils.reverse(key);
        return suffixSearch(reverseDetectSegment, limit, suffixTrie);
    }

    public static List<MapResult> suffixSearch(String key, int limit, BinTrie<List<String>> binTrie) {
        Set<Map.Entry<String, List<String>>> result = prefixSearchLimit(key, limit, binTrie);
        return result.stream().map(
                entry -> {
                    String name = entry.getKey().replace("#", " ");
                    // strip the suffix nature marker so callers see the plain nature name
                    List<String> natures = entry.getValue().stream()
                            .map(nature -> nature.replaceAll(NatureType.SUFFIX.getType(), ""))
                            .collect(Collectors.toList());
                    // stored reversed; flip back before returning
                    name = StringUtils.reverse(name);
                    return new MapResult(name, natures);
                }
        ).sorted((a, b) -> -(b.getName().length() - a.getName().length())) // ascending by name length
                .limit(SEARCH_SIZE)
                .collect(Collectors.toList());
    }

    /**
     * Walks the trie to the node for {@code key} and collects up to {@code limit} entries
     * under it. Lookup is case-insensitive (keys are stored lower-cased).
     */
    private static Set<Map.Entry<String, List<String>>> prefixSearchLimit(String key, int limit,
            BinTrie<List<String>> binTrie) {
        key = key.toLowerCase();
        Set<Map.Entry<String, List<String>>> entrySet = new TreeSet<Map.Entry<String, List<String>>>();
        // NOTE(review): seeded with key minus its last char — presumably walkLimit
        // re-appends the final node's own char; confirm against BinTrie.walkLimit.
        StringBuilder sb = new StringBuilder(key.substring(0, key.length() - 1));
        BaseNode branch = binTrie;
        char[] chars = key.toCharArray();
        for (char aChar : chars) {
            if (branch == null) {
                return entrySet;
            }
            branch = branch.getChild(aChar);
        }

        if (branch == null) {
            return entrySet;
        }
        branch.walkLimit(sb, entrySet, limit);
        return entrySet;
    }

    // Drops all suggestions; new BinTrie instances let the old ones be garbage collected.
    public static void clear() {
        LOGGER.info("clear all trie");
        trie = new BinTrie<>();
        suffixTrie = new BinTrie<>();
    }

    // Registers a word in the prefix trie with its nature names as values.
    public static void put(String key, CoreDictionary.Attribute attribute) {
        trie.put(key, Arrays.stream(attribute.nature).map(entry -> entry.toString()).collect(Collectors.toList()));
    }

    /**
     * Bulk-loads suffix words: merges duplicate words' attributes first, then inserts
     * each (already reversed by the caller's convention) entry into the suffix trie.
     */
    public static void loadSuffix(List<WordNature> suffixes) {
        if (CollectionUtils.isEmpty(suffixes)) {
            return;
        }
        TreeMap<String, CoreDictionary.Attribute> map = new TreeMap();
        for (WordNature suffix : suffixes) {
            CoreDictionary.Attribute attributeNew = suffix.getNatureWithFrequency() == null
                    ? new CoreDictionary.Attribute(Nature.nz, 1)
                    : CoreDictionary.Attribute.create(suffix.getNatureWithFrequency());
            if (map.containsKey(suffix.getWord())) {
                attributeNew = DictionaryAttributeUtil.getAttribute(map.get(suffix.getWord()), attributeNew);
            }
            map.put(suffix.getWord(), attributeNew);
        }
        for (Map.Entry<String, CoreDictionary.Attribute> stringAttributeEntry : map.entrySet()) {
            putSuffix(stringAttributeEntry.getKey(), stringAttributeEntry.getValue());
        }
    }

    // Registers a word in the suffix trie with its nature names as values.
    public static void putSuffix(String key, CoreDictionary.Attribute attribute) {
        suffixTrie.put(key,
                Arrays.stream(attribute.nature).map(entry -> entry.toString()).collect(Collectors.toList()));
    }

}
|
||||
|
||||
@@ -0,0 +1,93 @@
|
||||
package com.tencent.supersonic.knowledge.infrastructure.repository;
|
||||
|
||||
import com.tencent.supersonic.common.enums.TaskStatusEnum;
|
||||
import com.tencent.supersonic.knowledge.domain.converter.DictTaskConverter;
|
||||
import com.tencent.supersonic.knowledge.domain.dataobject.DictConfPO;
|
||||
import com.tencent.supersonic.knowledge.domain.dataobject.DimValueDictTaskPO;
|
||||
import com.tencent.supersonic.knowledge.domain.pojo.DictConfig;
|
||||
import com.tencent.supersonic.knowledge.domain.pojo.DictTaskFilter;
|
||||
import com.tencent.supersonic.knowledge.domain.pojo.DimValueDictInfo;
|
||||
import com.tencent.supersonic.knowledge.domain.repository.DictRepository;
|
||||
import com.tencent.supersonic.knowledge.infrastructure.custom.DictConfMapper;
|
||||
import com.tencent.supersonic.knowledge.infrastructure.custom.DictTaskMapper;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Date;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.concurrent.CompletableFuture;
|
||||
import org.springframework.beans.BeanUtils;
|
||||
import org.springframework.stereotype.Repository;
|
||||
import org.springframework.util.CollectionUtils;
|
||||
|
||||
|
||||
@Repository
|
||||
public class DictRepositoryImpl implements DictRepository {
|
||||
|
||||
private final DictTaskMapper dictTaskMapper;
|
||||
private final DictConfMapper dictConfMapper;
|
||||
|
||||
public DictRepositoryImpl(DictTaskMapper dictTaskMapper,
|
||||
DictConfMapper dictConfMapper) {
|
||||
this.dictTaskMapper = dictTaskMapper;
|
||||
this.dictConfMapper = dictConfMapper;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Long createDimValueDictTask(DimValueDictTaskPO dimValueDictTaskPO) {
|
||||
dictTaskMapper.createDimValueTask(dimValueDictTaskPO);
|
||||
return dimValueDictTaskPO.getId();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Boolean updateDictTaskStatus(Integer status, DimValueDictTaskPO dimValueDictTaskPO) {
|
||||
dimValueDictTaskPO.setStatus(status);
|
||||
Date createdAt = dimValueDictTaskPO.getCreatedAt();
|
||||
long elapsedMs = System.currentTimeMillis() - createdAt.getTime();
|
||||
dimValueDictTaskPO.setElapsedMs(elapsedMs);
|
||||
CompletableFuture.supplyAsync(() -> {
|
||||
dictTaskMapper.updateTaskStatus(dimValueDictTaskPO);
|
||||
return null;
|
||||
});
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<DimValueDictInfo> searchDictTaskList(DictTaskFilter filter) {
|
||||
List<DimValueDictInfo> dimValueDictDescList = new ArrayList<>();
|
||||
List<DimValueDictTaskPO> dimValueDictTaskPOList = dictTaskMapper.searchDictTaskList(filter);
|
||||
if (!CollectionUtils.isEmpty(dimValueDictTaskPOList)) {
|
||||
dimValueDictTaskPOList.stream().forEach(dictTaskPO -> {
|
||||
DimValueDictInfo dimValueDictDesc = new DimValueDictInfo();
|
||||
BeanUtils.copyProperties(dictTaskPO, dimValueDictDesc);
|
||||
dimValueDictDesc.setStatus(TaskStatusEnum.of(dictTaskPO.getStatus()));
|
||||
dimValueDictDescList.add(dimValueDictDesc);
|
||||
});
|
||||
}
|
||||
return dimValueDictDescList;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Boolean createDictConf(DictConfPO dictConfPO) {
|
||||
return dictConfMapper.createDictConf(dictConfPO);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Boolean editDictConf(DictConfPO dictConfPO) {
|
||||
return dictConfMapper.editDictConf(dictConfPO);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Boolean upsertDictInfo(DictConfPO dictConfPO) {
|
||||
return dictConfMapper.upsertDictInfo(dictConfPO);
|
||||
}
|
||||
|
||||
@Override
|
||||
public DictConfig getDictInfoByDomainId(Long domainId) {
|
||||
DictConfPO dictConfPO = dictConfMapper.getDictInfoByDomainId(domainId);
|
||||
if (Objects.isNull(dictConfPO)) {
|
||||
return null;
|
||||
}
|
||||
return DictTaskConverter.dictConfPO2Config(dictConfPO);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,2 @@
|
||||
com.tencent.supersonic.knowledge.domain.FileHandler=\
|
||||
com.tencent.supersonic.knowledge.domain.LocalFileHandler
|
||||
53
chat/knowledge/src/main/resources/mapper/DictConfMapper.xml
Normal file
53
chat/knowledge/src/main/resources/mapper/DictConfMapper.xml
Normal file
@@ -0,0 +1,53 @@
|
||||
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN"
        "http://mybatis.org/dtd/mybatis-3-mapper.dtd">

<!-- SQL mapping for dictionary configuration rows (table: s2_dictionary).
     NOTE(review): the columns used here (domain_id, dim_value_infos, status) do not
     match the s2_dictionary DDL shipped in sql.ddl (item_id, black_list, ...) -
     confirm which schema version is current. -->
<mapper namespace="com.tencent.supersonic.knowledge.infrastructure.custom.DictConfMapper">

    <!-- Column-to-property mapping for DictConfPO -->
    <resultMap id="DictConfPO"
            type="com.tencent.supersonic.knowledge.domain.dataobject.DictConfPO">
        <id column="id" property="id"/>
        <result column="domain_id" property="domainId"/>
        <result column="dim_value_infos" property="dimValueInfos"/>
        <result column="created_at" property="createdAt"/>
        <result column="updated_at" property="updatedAt"/>
        <result column="created_by" property="createdBy"/>
        <result column="updated_by" property="updatedBy"/>
    </resultMap>

    <!-- Plain insert of a new configuration row -->
    <insert id="createDictConf">
        insert into s2_dictionary
        (`domain_id`, dim_value_infos, created_at, updated_at, created_by, updated_by)
        values
        (#{domainId}, #{dimValueInfos}, #{createdAt}, #{updatedAt}, #{createdBy}, #{updatedBy})
    </insert>

    <!-- Insert-or-update keyed on the domain (requires a unique key on domain_id) -->
    <insert id="upsertDictInfo">
        insert into s2_dictionary
        (`domain_id`, dim_value_infos, created_at, updated_at, created_by, updated_by)
        values
        (#{domainId}, #{dimValueInfos}, #{createdAt}, #{updatedAt}, #{createdBy}, #{updatedBy})
        on duplicate key update
        dim_value_infos = #{dimValueInfos},
        updated_at = #{updatedAt},
        updated_by = #{updatedBy}
    </insert>

    <!-- Update the active (status = 0) configuration of a domain -->
    <update id="editDictConf">
        update s2_dictionary
        set dim_value_infos = #{dimValueInfos},
        updated_at = #{updatedAt},
        updated_by = #{updatedBy}
        where domain_id = #{domainId}
        and status = 0
    </update>

    <!-- Fetch the active (status = 0) configuration of a domain -->
    <select id="getDictInfoByDomainId" resultMap="DictConfPO">
        select *
        from s2_dictionary
        where domain_id = #{domainId}
        and status = 0
    </select>


</mapper>
|
||||
71
chat/knowledge/src/main/resources/mapper/DictTaskMapper.xml
Normal file
71
chat/knowledge/src/main/resources/mapper/DictTaskMapper.xml
Normal file
@@ -0,0 +1,71 @@
|
||||
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN"
        "http://mybatis.org/dtd/mybatis-3-mapper.dtd">

<!-- SQL mapping for dictionary build tasks (table: s2_dictionary_task) -->
<mapper namespace="com.tencent.supersonic.knowledge.infrastructure.custom.DictTaskMapper">

    <!-- Column-to-property mapping for DimValueDictTaskPO -->
    <resultMap id="DimValueDictTaskPO"
            type="com.tencent.supersonic.knowledge.domain.dataobject.DimValueDictTaskPO">
        <id column="id" property="id"/>
        <result column="name" property="name"/>
        <result column="description" property="description"/>
        <result column="command" property="command"/>
        <result column="command_md5" property="commandMd5"/>
        <result column="status" property="status"/>
        <result column="created_by" property="createdBy"/>
        <result column="created_at" property="createdAt"/>
        <result column="progress" property="progress"/>
        <result column="elapsed_ms" property="elapsedMs"/>
    </resultMap>

    <!-- Insert a new task row; created_at is left to the column default -->
    <insert id="createDimValueTask">
        insert into s2_dictionary_task
        (`name`, description, command, command_md5, status, created_by, progress, elapsed_ms)
        values
        (#{name}, #{description}, #{command}, #{commandMd5}, #{status}, #{createdBy}, #{progress}, #{elapsedMs})
    </insert>

    <!-- Partial update of a running (status = 0) task, keyed by name -->
    <update id="updateTaskStatus">
        update s2_dictionary_task
        <set>
            <if test="description != null and description !=''">
                description = #{description},
            </if>
            <if test="status != null">
                status = #{status},
            </if>
            <if test="progress != null">
                progress = #{progress},
            </if>
            <if test="elapsedMs != null">
                elapsed_ms = #{elapsedMs},
            </if>

        </set>
        where name = #{name}
        and status = 0
    </update>

    <!-- Filtered task listing; every condition is optional.
         NOTE(review): like "%"#{name}"%" relies on MySQL adjacent-literal handling
         and is not portable - CONCAT('%', #{name}, '%') is the standard form; confirm
         target database before changing. -->
    <select id="searchDictTaskList" resultMap="DimValueDictTaskPO">
        select *
        from s2_dictionary_task
        <where>
            <if test="id != null and id != ''">
                and id >= #{id}
            </if>
            <if test="name != null and name !=''">
                and `name` like "%"#{name}"%"
            </if>
            <if test="createdBy != null and createdBy !=''">
                and created_by = #{createdBy}
            </if>
            <if test="createdAt != null and createdAt !=''">
                and created_at >= #{createdAt}
            </if>
            <if test="status != null and status !=''">
                and status= #{status}
            </if>
        </where>
    </select>

</mapper>
|
||||
15
chat/knowledge/src/main/resources/sql.ddl/s2_dictionary.sql
Normal file
15
chat/knowledge/src/main/resources/sql.ddl/s2_dictionary.sql
Normal file
@@ -0,0 +1,15 @@
|
||||
-- Dictionary configuration table: per-item (dimension/metric) dictionary settings.
-- NOTE(review): DictConfMapper.xml reads/writes domain_id, dim_value_infos and status,
-- none of which exist in this DDL - confirm which schema version is current.
CREATE TABLE IF NOT EXISTS `s2_dictionary` (
    `id` bigint(20) unsigned NOT NULL AUTO_INCREMENT,
    `item_id` bigint(20) DEFAULT NULL COMMENT '对应维度id、指标id等',
    `type` varchar(50) DEFAULT NULL COMMENT '对应维度、指标等',
    `black_list` mediumtext COMMENT '字典黑名单',
    `white_list` mediumtext COMMENT '字典白名单',
    `rule_list` mediumtext COMMENT '字典规则',
    `is_dict_Info` tinyint(1) NOT NULL DEFAULT '0' COMMENT '1-开启写入字典,0-不开启',
    `created_at` datetime NOT NULL COMMENT '创建时间',
    `updated_at` datetime NOT NULL COMMENT '更新时间',
    `created_by` varchar(100) NOT NULL COMMENT '创建人',
    `updated_by` varchar(100) DEFAULT NULL COMMENT '更新人',
    `is_deleted` tinyint(1) NOT NULL DEFAULT '0' COMMENT '1-删除,0-可用',
    PRIMARY KEY (`id`)
) COMMENT='字典配置信息表'
|
||||
@@ -0,0 +1,11 @@
|
||||
-- Dictionary task table: one row per dimension-value dictionary build task.
-- Added command_md5 and progress, which DictTaskMapper.xml inserts/selects but were
-- missing from this DDL (inserts would otherwise fail with unknown-column errors).
CREATE TABLE IF NOT EXISTS `s2_dictionary_task` (
    `id` bigint(20) unsigned NOT NULL AUTO_INCREMENT,
    `name` varchar(255) NOT NULL COMMENT '任务名称',
    `description` varchar(255) NOT NULL COMMENT '任务描述',
    `command` mediumtext NOT NULL COMMENT '任务请求参数',
    `command_md5` varchar(255) DEFAULT NULL COMMENT 'md5 of the task command',
    `status` int(10) NOT NULL COMMENT '任务最终运行状态',
    `progress` double DEFAULT NULL COMMENT 'task progress',
    `created_at` datetime DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
    `created_by` varchar(100) NOT NULL COMMENT '创建人',
    `elapsed_ms` bigint(10) DEFAULT NULL COMMENT '任务耗时',
    PRIMARY KEY (`id`)
)COMMENT='字典任务信息表'
|
||||
Reference in New Issue
Block a user