first commit

This commit is contained in:
jerryjzhang
2023-06-12 18:44:01 +08:00
commit dc4fc69b57
879 changed files with 573090 additions and 0 deletions

View File

@@ -0,0 +1,319 @@
package com.hankcs.hanlp.collection.trie.bintrie;
import com.hankcs.hanlp.corpus.io.ByteArray;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;
import java.util.AbstractMap;
import java.util.ArrayDeque;
import java.util.Arrays;
import java.util.Map;
import java.util.Queue;
import java.util.Set;
public abstract class BaseNode<V> implements Comparable<BaseNode> {
/**
* Status array, for convenient lookup when loading
*/
static final Status[] ARRAY_STATUS = Status.values();
/**
* Child nodes
*/
protected BaseNode[] child;
/**
* Node status
*/
protected Status status;
/**
* The character this node represents
*/
protected char c;
/**
* The value this node represents
*/
protected V value;
public String prefix = null;
public BaseNode<V> transition(String path, int begin) {
BaseNode<V> cur = this;
for (int i = begin; i < path.length(); ++i) {
cur = cur.getChild(path.charAt(i));
if (cur == null || cur.status == Status.UNDEFINED_0) {
return null;
}
}
return cur;
}
public BaseNode<V> transition(char[] path, int begin) {
BaseNode<V> cur = this;
for (int i = begin; i < path.length; ++i) {
cur = cur.getChild(path[i]);
if (cur == null || cur.status == Status.UNDEFINED_0) {
return null;
}
}
return cur;
}
/**
* Transition by a single character.
*
* @param path the character to transition on
* @return the target node, or null if the transition is undefined
*/
public BaseNode<V> transition(char path) {
BaseNode<V> cur = this;
cur = cur.getChild(path);
if (cur == null || cur.status == Status.UNDEFINED_0) {
return null;
}
return cur;
}
/**
* Add a child node.
*
* @return true if a new node was added, false if an existing node was modified
*/
protected abstract boolean addChild(BaseNode node);
/**
* Whether this node has a child for the given character.
*
* @param c the child's char
* @return true if such a child exists
*/
protected boolean hasChild(char c) {
return getChild(c) != null;
}
protected char getChar() {
return c;
}
/**
* Get a child node.
*
* @param c the child's char
* @return the child node
*/
public abstract BaseNode getChild(char c);
/**
* Get the value associated with this node.
*
* @return the value
*/
public final V getValue() {
return value;
}
/**
* Set the value associated with this node.
*
* @param value the value
*/
public final void setValue(V value) {
this.value = value;
}
@Override
public int compareTo(BaseNode other) {
return compareTo(other.getChar());
}
/**
* Overload: compare against a character.
*
* @param other the character to compare with
* @return the comparison result
*/
public int compareTo(char other) {
if (this.c > other) {
return 1;
}
if (this.c < other) {
return -1;
}
return 0;
}
/**
* Get the node's word-formation status.
*
* @return the status
*/
public Status getStatus() {
return status;
}
protected void walk(StringBuilder sb, Set<Map.Entry<String, V>> entrySet) {
sb.append(c);
if (status == Status.WORD_MIDDLE_2 || status == Status.WORD_END_3) {
entrySet.add(new TrieEntry(sb.toString(), value));
}
if (child == null) {
return;
}
for (BaseNode node : child) {
if (node == null) {
continue;
}
node.walk(new StringBuilder(sb.toString()), entrySet);
}
}
protected void walkToSave(DataOutputStream out) throws IOException {
out.writeChar(c);
out.writeInt(status.ordinal());
int childSize = 0;
if (child != null) {
childSize = child.length;
}
out.writeInt(childSize);
if (child == null) {
return;
}
for (BaseNode node : child) {
node.walkToSave(out);
}
}
protected void walkToSave(ObjectOutput out) throws IOException {
out.writeChar(c);
out.writeInt(status.ordinal());
if (status == Status.WORD_END_3 || status == Status.WORD_MIDDLE_2) {
out.writeObject(value);
}
int childSize = 0;
if (child != null) {
childSize = child.length;
}
out.writeInt(childSize);
if (child == null) {
return;
}
for (BaseNode node : child) {
node.walkToSave(out);
}
}
protected void walkToLoad(ByteArray byteArray, _ValueArray<V> valueArray) {
c = byteArray.nextChar();
status = ARRAY_STATUS[byteArray.nextInt()];
if (status == Status.WORD_END_3 || status == Status.WORD_MIDDLE_2) {
value = valueArray.nextValue();
}
int childSize = byteArray.nextInt();
child = new BaseNode[childSize];
for (int i = 0; i < childSize; ++i) {
child[i] = new Node<V>();
child[i].walkToLoad(byteArray, valueArray);
}
}
protected void walkToLoad(ObjectInput byteArray) throws IOException, ClassNotFoundException {
c = byteArray.readChar();
status = ARRAY_STATUS[byteArray.readInt()];
if (status == Status.WORD_END_3 || status == Status.WORD_MIDDLE_2) {
value = (V) byteArray.readObject();
}
int childSize = byteArray.readInt();
child = new BaseNode[childSize];
for (int i = 0; i < childSize; ++i) {
child[i] = new Node<V>();
child[i].walkToLoad(byteArray);
}
}
public enum Status {
/**
* Unspecified; used when deleting entries
*/
UNDEFINED_0,
/**
* Not the end of a word
*/
NOT_WORD_1,
/**
* The end of a word that can also continue
*/
WORD_MIDDLE_2,
/**
* The end of a word, with no continuation
*/
WORD_END_3,
}
public class TrieEntry extends AbstractMap.SimpleEntry<String, V> implements Comparable<TrieEntry> {
public TrieEntry(String key, V value) {
super(key, value);
}
@Override
public int compareTo(TrieEntry o) {
return getKey().compareTo(String.valueOf(o.getKey()));
}
}
@Override
public String toString() {
return "BaseNode{"
+ "child="
+ Arrays.toString(child)
+ ", status="
+ status
+ ", c="
+ c
+ ", value="
+ value
+ ", prefix='"
+ prefix
+ '\''
+ '}';
}
public void walkNode(Set<Map.Entry<String, V>> entrySet) {
if (status == Status.WORD_MIDDLE_2 || status == Status.WORD_END_3) {
String name = this.prefix != null ? this.prefix + c : "" + c;
entrySet.add(new TrieEntry(name, value));
}
}
/***
* Breadth-first walk that collects at most {@code limit} entries.
* @param sb prefix accumulated so far
* @param entrySet collector for matched entries
* @param limit maximum number of entries to collect
*/
public void walkLimit(StringBuilder sb, Set<Map.Entry<String, V>> entrySet, int limit) {
Queue<BaseNode> queue = new ArrayDeque<>();
this.prefix = sb.toString();
queue.add(this);
while (!queue.isEmpty()) {
if (entrySet.size() >= limit) {
break;
}
BaseNode root = queue.poll();
if (root == null) {
continue;
}
root.walkNode(entrySet);
if (root.child == null) {
continue;
}
String prefix = root.prefix + root.c;
for (BaseNode node : root.child) {
node.prefix = prefix;
queue.add(node);
}
}
}
}
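
Usage sketch (not part of the commit), assuming HanLP's BinTrie, which extends BaseNode&lt;V&gt;:

BinTrie<Integer> trie = new BinTrie<>();
trie.put("语言", 1);
trie.put("语言学", 2);
// transition walks the child links one char at a time; null means the prefix is absent
BaseNode<Integer> node = trie.transition("语言", 0);
if (node != null && node.getStatus() != BaseNode.Status.NOT_WORD_1) {
System.out.println(node.getValue()); // 1: "语言" ends a word and continues into "语言学"
}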

View File

@@ -0,0 +1,393 @@
package com.hankcs.hanlp.dictionary;
import static com.hankcs.hanlp.utility.Predefine.logger;
import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.collection.trie.DoubleArrayTrie;
import com.hankcs.hanlp.corpus.io.ByteArray;
import com.hankcs.hanlp.corpus.io.IOUtil;
import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.utility.Predefine;
import com.hankcs.hanlp.utility.TextUtility;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.DataOutputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.util.Collection;
import java.util.TreeMap;
/**
* Core dictionary implemented with a DoubleArrayTrie
*/
public class CoreDictionary {
public static DoubleArrayTrie<Attribute> trie = new DoubleArrayTrie<Attribute>();
public static final String PATH = HanLP.Config.CoreDictionaryPath;
// Load the dictionary automatically
static {
long start = System.currentTimeMillis();
if (!load(PATH)) {
throw new IllegalArgumentException("核心词典" + PATH + "加载失败");
} else {
logger.info(PATH + "加载成功," + trie.size() + "个词条,耗时" + (System.currentTimeMillis() - start) + "ms");
}
}
// Some special WORD_IDs
public static final int NR_WORD_ID = getWordID(Predefine.TAG_PEOPLE);
public static final int NS_WORD_ID = getWordID(Predefine.TAG_PLACE);
public static final int NT_WORD_ID = getWordID(Predefine.TAG_GROUP);
public static final int T_WORD_ID = getWordID(Predefine.TAG_TIME);
public static final int X_WORD_ID = getWordID(Predefine.TAG_CLUSTER);
public static final int M_WORD_ID = getWordID(Predefine.TAG_NUMBER);
public static final int NX_WORD_ID = getWordID(Predefine.TAG_PROPER);
private static boolean load(String path) {
logger.info("核心词典开始加载:" + path);
if (loadDat(path)) {
return true;
}
TreeMap<String, Attribute> map = new TreeMap<String, Attribute>();
BufferedReader br = null;
try {
br = new BufferedReader(new InputStreamReader(IOUtil.newInputStream(path), "UTF-8"));
String line;
int totalFrequency = 0;
long start = System.currentTimeMillis();
while ((line = br.readLine()) != null) {
String[] param = line.split("\\s");
int natureCount = (param.length - 1) / 2;
Attribute attribute = new Attribute(natureCount);
for (int i = 0; i < natureCount; ++i) {
attribute.nature[i] = Nature.create(param[1 + 2 * i]);
attribute.frequency[i] = Integer.parseInt(param[2 + 2 * i]);
attribute.totalFrequency += attribute.frequency[i];
}
map.put(param[0], attribute);
totalFrequency += attribute.totalFrequency;
}
logger.info(
"核心词典读入词条" + map.size() + " 全部频次" + totalFrequency + ",耗时" + (System.currentTimeMillis() - start)
+ "ms");
br.close();
trie.build(map);
logger.info("核心词典加载成功:" + trie.size() + "个词条,下面将写入缓存……");
try {
DataOutputStream out = new DataOutputStream(
new BufferedOutputStream(IOUtil.newOutputStream(path + Predefine.BIN_EXT)));
Collection<Attribute> attributeList = map.values();
out.writeInt(attributeList.size());
for (Attribute attribute : attributeList) {
out.writeInt(attribute.totalFrequency);
out.writeInt(attribute.nature.length);
for (int i = 0; i < attribute.nature.length; ++i) {
out.writeInt(attribute.nature[i].ordinal());
out.writeInt(attribute.frequency[i]);
}
}
trie.save(out);
out.writeInt(totalFrequency);
Predefine.setTotalFrequency(totalFrequency);
out.close();
} catch (Exception e) {
logger.warning("保存失败" + e);
return false;
}
} catch (FileNotFoundException e) {
logger.warning("核心词典" + path + "不存在!" + e);
return false;
} catch (IOException e) {
logger.warning("核心词典" + path + "读取错误!" + e);
return false;
}
return true;
}
/**
* Load the double array from disk.
*
* @param path dictionary path (the .bin cache lives alongside it)
* @return whether loading succeeded
*/
static boolean loadDat(String path) {
try {
ByteArray byteArray = ByteArray.createByteArray(path + Predefine.BIN_EXT);
if (byteArray == null) {
return false;
}
int size = byteArray.nextInt();
Attribute[] attributes = new Attribute[size];
final Nature[] natureIndexArray = Nature.values();
for (int i = 0; i < size; ++i) {
// the first int is the total frequency, the second the number of natures
int currentTotalFrequency = byteArray.nextInt();
int length = byteArray.nextInt();
attributes[i] = new Attribute(length);
attributes[i].totalFrequency = currentTotalFrequency;
for (int j = 0; j < length; ++j) {
attributes[i].nature[j] = natureIndexArray[byteArray.nextInt()];
attributes[i].frequency[j] = byteArray.nextInt();
}
}
if (!trie.load(byteArray, attributes)) {
return false;
}
int totalFrequency = 0;
if (byteArray.hasMore()) {
totalFrequency = byteArray.nextInt();
} else {
for (Attribute attribute : attributes) {
totalFrequency += attribute.totalFrequency;
}
}
Predefine.setTotalFrequency(totalFrequency);
} catch (Exception e) {
logger.warning("读取失败,问题发生在" + e);
return false;
}
return true;
}
/**
* Get an entry.
*
* @param key the word
* @return its attribute, or null if absent
*/
public static Attribute get(String key) {
return trie.get(key);
}
/**
* Get an entry.
*
* @param wordID the word's ID
* @return its attribute
*/
public static Attribute get(int wordID) {
return trie.get(wordID);
}
/**
* Get a term's total frequency.
*
* @param term the term
* @return its total frequency, or 0 if absent
*/
public static int getTermFrequency(String term) {
Attribute attribute = get(term);
if (attribute == null) {
return 0;
}
return attribute.totalFrequency;
}
/**
* Whether the dictionary contains the word.
*
* @param key the word
* @return true if present
*/
public static boolean contains(String key) {
return trie.get(key) != null;
}
/**
* Word attributes in the core dictionary
*/
public static class Attribute implements Serializable {
/**
* List of natures (parts of speech)
*/
public Nature[] nature;
/**
* Frequency of each nature
*/
public int[] frequency;
public int totalFrequency;
public String original = null;
public Attribute(int size) {
nature = new Nature[size];
frequency = new int[size];
}
public Attribute(Nature[] nature, int[] frequency) {
this.nature = nature;
this.frequency = frequency;
}
public Attribute(Nature nature, int frequency) {
this(1);
this.nature[0] = nature;
this.frequency[0] = frequency;
totalFrequency = frequency;
}
public Attribute(Nature[] nature, int[] frequency, int totalFrequency) {
this.nature = nature;
this.frequency = frequency;
this.totalFrequency = totalFrequency;
}
/**
* Construct with a single nature and a default frequency of 1000.
*
* @param nature the nature
*/
public Attribute(Nature nature) {
this(nature, 1000);
}
public static Attribute create(String natureWithFrequency) {
try {
String[] param = natureWithFrequency.split(" ");
if (param.length % 2 != 0) {
return new Attribute(Nature.create(natureWithFrequency.trim()), 1); // fail-safe for malformed input
}
int natureCount = param.length / 2;
Attribute attribute = new Attribute(natureCount);
for (int i = 0; i < natureCount; ++i) {
attribute.nature[i] = Nature.create(param[2 * i]);
attribute.frequency[i] = Integer.parseInt(param[1 + 2 * i]);
attribute.totalFrequency += attribute.frequency[i];
}
return attribute;
} catch (Exception e) {
logger.warning("使用字符串" + natureWithFrequency + "创建词条属性失败!" + TextUtility.exceptionToString(e));
return null;
}
}
/**
* Load from a byte stream.
*
* @param byteArray source bytes
* @param natureIndexArray nature lookup table
* @return the loaded attribute
*/
public static Attribute create(ByteArray byteArray, Nature[] natureIndexArray) {
int currentTotalFrequency = byteArray.nextInt();
int length = byteArray.nextInt();
Attribute attribute = new Attribute(length);
attribute.totalFrequency = currentTotalFrequency;
for (int j = 0; j < length; ++j) {
attribute.nature[j] = natureIndexArray[byteArray.nextInt()];
attribute.frequency[j] = byteArray.nextInt();
}
return attribute;
}
/**
* Get the frequency of a nature.
*
* @param nature the nature, as a string
* @return the frequency
* @deprecated prefer the overload that takes a Nature
*/
public int getNatureFrequency(String nature) {
try {
Nature pos = Nature.create(nature);
return getNatureFrequency(pos);
} catch (IllegalArgumentException e) {
return 0;
}
}
/**
* Get the frequency of a nature.
*
* @param nature the nature
* @return the frequency
*/
public int getNatureFrequency(final Nature nature) {
int i = 0;
for (Nature pos : this.nature) {
if (nature == pos) {
return frequency[i];
}
++i;
}
return 0;
}
/**
* Whether the word has the given nature.
*
* @param nature the nature
* @return true if present
*/
public boolean hasNature(Nature nature) {
return getNatureFrequency(nature) > 0;
}
/**
* Whether any nature starts with the given prefix.
*
* @param prefix nature prefix, e.g. u matches ude, uzhe, and so on
* @return true if any nature matches
*/
public boolean hasNatureStartsWith(String prefix) {
for (Nature n : nature) {
if (n.startsWith(prefix)) {
return true;
}
}
return false;
}
@Override
public String toString() {
final StringBuilder sb = new StringBuilder();
for (int i = 0; i < nature.length; ++i) {
sb.append(nature[i]).append(' ').append(frequency[i]).append(' ');
}
return sb.toString();
}
public void save(DataOutputStream out) throws IOException {
out.writeInt(totalFrequency);
out.writeInt(nature.length);
for (int i = 0; i < nature.length; ++i) {
out.writeInt(nature[i].ordinal());
out.writeInt(frequency[i]);
}
}
}
/**
* Get a word's ID.
*
* @param a the word
* @return the ID, or -1 if absent
*/
public static int getWordID(String a) {
return CoreDictionary.trie.exactMatchSearch(a);
}
/**
* Hot-reload the core dictionary.<br>
* In cluster environments, or with other IOAdapters, delete the cache file manually.
*
* @return whether the reload succeeded
*/
public static boolean reload() {
String path = CoreDictionary.PATH;
IOUtil.deleteFile(path + Predefine.BIN_EXT);
return load(path);
}
}
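
Usage sketch (not part of the commit); whether a given word is present depends on the core dictionary that ships with HanLP:

// 0 when the word is absent
int frequency = CoreDictionary.getTermFrequency("商品");
CoreDictionary.Attribute attribute = CoreDictionary.get("商品");
if (attribute != null && attribute.hasNature(Nature.n)) {
System.out.println(attribute.getNatureFrequency(Nature.n));
}
int wordId = CoreDictionary.getWordID("商品"); // -1 if absent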

View File

@@ -0,0 +1,69 @@
package com.hankcs.hanlp.seg.common;
import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.dictionary.CoreDictionary;
import com.hankcs.hanlp.dictionary.CustomDictionary;
import com.tencent.supersonic.knowledge.infrastructure.nlp.HanlpHelper;
import lombok.Data;
import lombok.ToString;
@Data
@ToString
public class Term {
public String word;
public Nature nature;
public int offset;
public int frequency = 0;
public Term(String word, Nature nature) {
this.word = word;
this.nature = nature;
}
public Term(String word, Nature nature, int offset) {
this.word = word;
this.nature = nature;
this.offset = offset;
}
public Term(String word, Nature nature, int offset, int frequency) {
this.word = word;
this.nature = nature;
this.offset = offset;
this.frequency = frequency;
}
public int length() {
return this.word.length();
}
public int getFrequency() {
if (frequency > 0) {
return frequency;
}
String wordOri = word.toLowerCase();
CoreDictionary.Attribute attribute = HanlpHelper.getDynamicCustomDictionary().get(wordOri);
if (attribute == null) {
attribute = CoreDictionary.get(wordOri);
if (attribute == null) {
attribute = CustomDictionary.get(wordOri);
}
}
if (attribute != null && nature != null && attribute.hasNature(nature)) {
return attribute.getNatureFrequency(nature);
}
return attribute == null ? 0 : attribute.totalFrequency;
}
@Override
public boolean equals(Object obj) {
if (obj instanceof Term) {
Term term = (Term) obj;
if (this.nature == term.nature && this.word.equals(term.word)) {
return true;
}
}
return super.equals(obj);
}
}
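
For reference, getFrequency above resolves in this order: an explicit frequency wins; otherwise the (lowercased) word is looked up in the dynamic custom dictionary, then the core dictionary, then CustomDictionary. A minimal sketch:

Term term = new Term("语言", Nature.n, 0);
int f = term.getFrequency(); // falls back to dictionary lookups because frequency == 0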

View File

@@ -0,0 +1,57 @@
package com.tencent.supersonic.knowledge.application.online;
import com.tencent.supersonic.common.nlp.ItemDO;
import com.tencent.supersonic.common.nlp.NatureType;
import com.tencent.supersonic.common.nlp.WordNature;
import java.util.ArrayList;
import java.util.List;
import lombok.extern.slf4j.Slf4j;
/**
* base word nature
*/
@Slf4j
public abstract class BaseWordNature {
/**
* Get word natures for all items, logging (and swallowing) failures.
*
* @param itemDOS the items
* @return the collected word natures
*/
public List<WordNature> getWordNatureList(List<ItemDO> itemDOS) {
List<WordNature> wordNatures = new ArrayList<>();
try {
wordNatures = getWordNaturesWithException(itemDOS);
} catch (Exception e) {
log.error("getWordNatureList error,", e);
}
return wordNatures;
}
public List<WordNature> getWordNaturesWithException(List<ItemDO> itemDOS) {
List<WordNature> wordNatures = new ArrayList<>();
for (ItemDO itemDO : itemDOS) {
wordNatures.addAll(getWordNature(itemDO.getName(), itemDO));
}
return wordNatures;
}
public abstract List<WordNature> getWordNature(String word, ItemDO itemDO);
public Integer getElementID(String nature) {
String[] split = nature.split(NatureType.NATURE_SPILT);
if (split.length >= 3) {
return Integer.valueOf(split[2]);
}
return 0;
}
public static Integer getDomain(String nature) {
String[] split = nature.split(NatureType.NATURE_SPILT);
return Integer.valueOf(split[1]);
}
}
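
For orientation: the subclasses below build nature strings as NATURE_SPILT + domainId + NATURE_SPILT + itemId + type, which getDomain/getElementID split back apart. A hypothetical sketch, assuming NATURE_SPILT is "_" and the type strings carry their own leading separator:

// hypothetical nature string; the real separator and type suffixes come from NatureType
String nature = "_3_12_dimension";
Integer domain = BaseWordNature.getDomain(nature); // 3
Integer elementId = new DimensionWordNature().getElementID(nature); // 12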

View File

@@ -0,0 +1,49 @@
package com.tencent.supersonic.knowledge.application.online;
import com.google.common.collect.Lists;
import com.tencent.supersonic.common.nlp.ItemDO;
import com.tencent.supersonic.common.nlp.NatureType;
import com.tencent.supersonic.common.nlp.WordNature;
import java.util.List;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
/**
* dimension word nature
*/
@Service
public class DimensionWordNature extends BaseWordNature {
@Value("${nlp.dimension.use.suffix:true}")
private boolean nlpDimensionUseSuffix = true;
@Override
public List<WordNature> getWordNature(String word, ItemDO itemDO) {
List<WordNature> result = Lists.newArrayList();
result.add(getOneWordNature(word, itemDO, false));
if (nlpDimensionUseSuffix) {
String reverseWord = StringUtils.reverse(word);
if (StringUtils.isNotEmpty(word) && !word.equalsIgnoreCase(reverseWord)) {
result.add(getOneWordNature(reverseWord, itemDO, true));
}
}
return result;
}
private WordNature getOneWordNature(String word, ItemDO itemDO, boolean isSuffix) {
WordNature wordNature = new WordNature();
wordNature.setWord(word);
Integer classId = itemDO.getDomain();
String nature = NatureType.NATURE_SPILT + classId + NatureType.NATURE_SPILT + itemDO.getItemId()
+ NatureType.DIMENSION.getType();
if (isSuffix) {
nature = NatureType.NATURE_SPILT + classId + NatureType.NATURE_SPILT + itemDO.getItemId()
+ NatureType.SUFFIX.getType() + NatureType.DIMENSION.getType();
}
wordNature.setNatureWithFrequency(String.format("%s 100000", nature));
return wordNature;
}
}

View File

@@ -0,0 +1,29 @@
package com.tencent.supersonic.knowledge.application.online;
import com.google.common.collect.Lists;
import com.tencent.supersonic.common.nlp.ItemDO;
import com.tencent.supersonic.common.nlp.NatureType;
import com.tencent.supersonic.common.nlp.WordNature;
import java.util.List;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;
/**
* domain word nature
*/
@Service
@Slf4j
public class DomainWordNature extends BaseWordNature {
@Override
public List<WordNature> getWordNature(String word, ItemDO itemDO) {
List<WordNature> result = Lists.newArrayList();
WordNature wordNature = new WordNature();
wordNature.setWord(word);
Integer classId = itemDO.getDomain();
String nature = NatureType.NATURE_SPILT + classId;
wordNature.setNatureWithFrequency(String.format("%s 100000", nature));
result.add(wordNature);
return result;
}
}

View File

@@ -0,0 +1,31 @@
package com.tencent.supersonic.knowledge.application.online;
import com.google.common.collect.Lists;
import com.tencent.supersonic.common.nlp.ItemDO;
import com.tencent.supersonic.common.nlp.NatureType;
import com.tencent.supersonic.common.nlp.WordNature;
import java.util.List;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;
/**
* dimension value wordNature
*/
@Service
@Slf4j
public class EntityWordNature extends BaseWordNature {
@Override
public List<WordNature> getWordNature(String word, ItemDO itemDO) {
List<WordNature> result = Lists.newArrayList();
WordNature wordNature = new WordNature();
wordNature.setWord(word);
Integer domain = itemDO.getDomain();
String nature = NatureType.NATURE_SPILT + domain + NatureType.NATURE_SPILT + itemDO.getItemId()
+ NatureType.ENTITY.getType();
wordNature.setNatureWithFrequency(String.format("%s 200000", nature));
result.add(wordNature);
return result;
}
}

View File

@@ -0,0 +1,48 @@
package com.tencent.supersonic.knowledge.application.online;
import com.google.common.collect.Lists;
import com.tencent.supersonic.common.nlp.ItemDO;
import com.tencent.supersonic.common.nlp.NatureType;
import com.tencent.supersonic.common.nlp.WordNature;
import java.util.List;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
/**
* Metric WordNature
*/
@Service
public class MetricWordNature extends BaseWordNature {
@Value("${nlp.metric.use.suffix:true}")
private boolean nlpMetricUseSuffix = true;
@Override
public List<WordNature> getWordNature(String word, ItemDO itemDO) {
List<WordNature> result = Lists.newArrayList();
result.add(getOneWordNature(word, itemDO, false));
if (nlpMetricUseSuffix) {
String reverseWord = StringUtils.reverse(word);
if (!word.equalsIgnoreCase(reverseWord)) {
result.add(getOneWordNature(reverseWord, itemDO, true));
}
}
return result;
}
private WordNature getOneWordNature(String word, ItemDO itemDO, boolean isSuffix) {
WordNature wordNature = new WordNature();
wordNature.setWord(word);
Integer classId = itemDO.getDomain();
String nature = NatureType.NATURE_SPILT + classId + NatureType.NATURE_SPILT + itemDO.getItemId()
+ NatureType.METRIC.getType();
if (isSuffix) {
nature = NatureType.NATURE_SPILT + classId + NatureType.NATURE_SPILT + itemDO.getItemId()
+ NatureType.SUFFIX.getType() + NatureType.METRIC.getType();
}
wordNature.setNatureWithFrequency(String.format("%s 100000", nature));
return wordNature;
}
}

View File

@@ -0,0 +1,60 @@
package com.tencent.supersonic.knowledge.application.online;
import com.tencent.supersonic.common.nlp.NatureType;
import com.tencent.supersonic.common.nlp.WordNature;
import com.tencent.supersonic.knowledge.domain.service.OnlineKnowledgeService;
import com.tencent.supersonic.knowledge.infrastructure.nlp.HanlpHelper;
import com.tencent.supersonic.knowledge.infrastructure.nlp.Suggester;
import java.util.List;
import java.util.stream.Collectors;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Service;
/**
* online knowledge service impl
*/
@Service
public class OnlineKnowledgeServiceImpl implements OnlineKnowledgeService {
private final Logger logger = LoggerFactory.getLogger(OnlineKnowledgeServiceImpl.class);
@Override
public void updateSemanticKnowledge(List<WordNature> natures) {
List<WordNature> prefixes = natures.stream()
.filter(entry -> !entry.getNatureWithFrequency().contains(NatureType.SUFFIX.getType()))
.collect(Collectors.toList());
for (WordNature nature : prefixes) {
HanlpHelper.addToCustomDictionary(nature);
}
List<WordNature> suffixes = natures.stream()
.filter(entry -> entry.getNatureWithFrequency().contains(NatureType.SUFFIX.getType()))
.collect(Collectors.toList());
Suggester.loadSuffix(suffixes);
}
@Override
public void reloadAllData(List<WordNature> natures) {
// 1. reload custom knowledge
try {
HanlpHelper.reloadCustomDictionary();
} catch (Exception e) {
logger.error("reloadCustomDictionary error", e);
}
// 2. update online knowledge
updateOnlineKnowledge(natures);
}
@Override
public void updateOnlineKnowledge(List<WordNature> natures) {
try {
updateSemanticKnowledge(natures);
} catch (Exception e) {
logger.error("updateSemanticKnowledge error", e);
}
}
}

View File

@@ -0,0 +1,30 @@
package com.tencent.supersonic.knowledge.application.online;
import com.google.common.collect.Lists;
import com.tencent.supersonic.common.nlp.ItemDO;
import com.tencent.supersonic.common.nlp.NatureType;
import com.tencent.supersonic.common.nlp.WordNature;
import java.util.List;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;
/**
* dimension value wordNature
*/
@Service
@Slf4j
public class ValueWordNature extends BaseWordNature {
@Override
public List<WordNature> getWordNature(String word, ItemDO itemDO) {
List<WordNature> result = Lists.newArrayList();
WordNature wordNature = new WordNature();
wordNature.setWord(word);
Integer domain = itemDO.getDomain();
String nature = NatureType.NATURE_SPILT + domain + NatureType.NATURE_SPILT + itemDO.getItemId();
wordNature.setNatureWithFrequency(String.format("%s 100000", nature));
result.add(wordNature);
return result;
}
}

View File

@@ -0,0 +1,28 @@
package com.tencent.supersonic.knowledge.application.online;
import com.tencent.supersonic.common.nlp.NatureType;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
/**
* WordNature Strategy Factory
*/
public class WordNatureStrategyFactory {
private static Map<NatureType, BaseWordNature> strategyFactory = new ConcurrentHashMap<>();
static {
strategyFactory.put(NatureType.DIMENSION, new DimensionWordNature());
strategyFactory.put(NatureType.METRIC, new MetricWordNature());
strategyFactory.put(NatureType.DOMAIN, new DomainWordNature());
strategyFactory.put(NatureType.ENTITY, new EntityWordNature());
strategyFactory.put(NatureType.VALUE, new ValueWordNature());
}
public static BaseWordNature get(NatureType strategyType) {
return strategyFactory.get(strategyType);
}
}
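
Resolution sketch (itemDOS is a hypothetical List&lt;ItemDO&gt;):

BaseWordNature strategy = WordNatureStrategyFactory.get(NatureType.DIMENSION);
List<WordNature> natures = strategy.getWordNatureList(itemDOS);

Note that the factory instantiates the @Service classes with new, so Spring's @Value injection (e.g. nlp.dimension.use.suffix) does not apply to these instances; they fall back to the field defaults.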

View File

@@ -0,0 +1,66 @@
package com.tencent.supersonic.knowledge.domain;
import java.util.List;
public interface FileHandler {
/**
* backup files to a specific directory
* config: dict.directory.backup
*
* @param fileName
*/
void backupFile(String fileName);
/**
* move files to a specific directory
* not backup
*
* @param fileName
* @param targetDirectory
*/
void moveFile(String fileName, String targetDirectory);
/**
* create a directory
*
* @param path
*/
void createDir(String path);
Boolean existPath(String path);
/**
* write data to a specific file,
* config dir: dict.directory.latest
*
* @param data
* @param fileName
* @param append
*/
void writeFile(List<String> data, String fileName, Boolean append);
/**
* get the knowledge file root directory
*
* @return
*/
String getDictRootPath();
/**
* delete dictionary file
* automatic backup
*
* @param fileName
* @return
*/
Boolean deleteDictFile(String fileName);
/**
* delete files directly without backup
*
* @param fileName
*/
void deleteFile(String fileName);
}

View File

@@ -0,0 +1,40 @@
package com.tencent.supersonic.knowledge.domain;
import com.tencent.supersonic.knowledge.infrastructure.nlp.HanlpHelper;
import java.io.FileNotFoundException;
import lombok.Data;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.annotation.Configuration;
@Data
@Configuration
@Slf4j
public class LocalFileConfig {
@Value("${dict.directory.latest:/data/dictionary/custom}")
private String dictDirectoryLatest;
@Value("${dict.directory.backup:./dict/backup}")
private String dictDirectoryBackup;
public String getDictDirectoryLatest() {
return getResourceDir() + dictDirectoryLatest;
}
public String getDictDirectoryBackup() {
return dictDirectoryBackup;
}
private String getResourceDir() {
String hanlpPropertiesPath = "";
try {
hanlpPropertiesPath = HanlpHelper.getHanlpPropertiesPath();
} catch (FileNotFoundException e) {
log.warn("getResourceDir, e:", e);
}
return hanlpPropertiesPath;
}
}

View File

@@ -0,0 +1,139 @@
package com.tencent.supersonic.knowledge.domain;
import java.io.BufferedWriter;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.nio.file.StandardOpenOption;
import java.util.List;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Component;
import org.springframework.util.CollectionUtils;
@Slf4j
@Component
public class LocalFileHandler implements FileHandler {
private final LocalFileConfig localFileConfig;
public LocalFileHandler(LocalFileConfig localFileConfig) {
this.localFileConfig = localFileConfig;
}
@Override
public void backupFile(String fileName) {
String dictDirectoryBackup = localFileConfig.getDictDirectoryBackup();
if (!existPath(dictDirectoryBackup)) {
createDir(dictDirectoryBackup);
}
String source = localFileConfig.getDictDirectoryLatest() + "/" + fileName;
String target = dictDirectoryBackup + "/" + fileName;
Path sourcePath = Paths.get(source);
Path targetPath = Paths.get(target);
try {
Files.copy(sourcePath, targetPath, StandardCopyOption.REPLACE_EXISTING);
log.info("File copied successfully!");
} catch (IOException e) {
log.info("Failed to copy file: " + e.getMessage());
}
}
@Override
public void moveFile(String filePath, String targetDirectoryPath) {
Path sourcePath = Paths.get(filePath);
Path targetPath = Paths.get(targetDirectoryPath, sourcePath.getFileName().toString());
try {
Files.move(sourcePath, targetPath, StandardCopyOption.REPLACE_EXISTING);
log.info("File moved successfully!");
} catch (IOException e) {
log.info("Failed to move file: " + e.getMessage());
}
}
@Override
public void createDir(String directoryPath) {
Path path = Paths.get(directoryPath);
try {
Files.createDirectories(path);
log.info("Directory created successfully!");
} catch (IOException e) {
log.info("Failed to create directory: " + e.getMessage());
}
}
@Override
public void deleteFile(String filePath) {
Path path = Paths.get(filePath);
try {
Files.delete(path);
log.info("File:{} deleted successfully!", getAbsolutePath(filePath));
} catch (IOException e) {
log.info("Failed to delete file:{}, e:", getAbsolutePath(filePath), e);
}
}
@Override
public Boolean existPath(String pathStr) {
Path path = Paths.get(pathStr);
if (Files.exists(path)) {
log.info("path:{} exists!", getAbsolutePath(pathStr));
return true;
} else {
log.info("path:{} not exists!", getAbsolutePath(pathStr));
}
return false;
}
@Override
public void writeFile(List<String> lines, String fileName, Boolean append) {
String dictDirectoryLatest = localFileConfig.getDictDirectoryLatest();
if (!existPath(dictDirectoryLatest)) {
createDir(dictDirectoryLatest);
}
String filePath = dictDirectoryLatest + "/" + fileName;
if (existPath(filePath)) {
backupFile(fileName);
}
try (BufferedWriter writer = getWriter(filePath, append)) {
if (!CollectionUtils.isEmpty(lines)) {
for (String line : lines) {
writer.write(line);
writer.newLine();
}
}
log.info("File:{} written successfully!", getAbsolutePath(filePath));
} catch (IOException e) {
log.info("Failed to write file:{}, e:", getAbsolutePath(filePath), e);
}
}
public String getAbsolutePath(String path) {
return Paths.get(path).toAbsolutePath().toString();
}
@Override
public String getDictRootPath() {
return Paths.get(localFileConfig.getDictDirectoryLatest()).toAbsolutePath().toString();
}
@Override
public Boolean deleteDictFile(String fileName) {
backupFile(fileName);
deleteFile(localFileConfig.getDictDirectoryLatest() + "/" + fileName);
return true;
}
private BufferedWriter getWriter(String filePath, Boolean append) throws IOException {
if (append) {
// CREATE so that appending still works when the file does not exist yet
return Files.newBufferedWriter(Paths.get(filePath), StandardCharsets.UTF_8,
StandardOpenOption.CREATE, StandardOpenOption.APPEND);
}
return Files.newBufferedWriter(Paths.get(filePath), StandardCharsets.UTF_8);
}
}
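
Usage sketch (the file name and dictionary line are hypothetical); writeFile backs up an existing file before writing:

FileHandler handler = new LocalFileHandler(localFileConfig);
handler.writeFile(Arrays.asList("销售额 _3_12_metric 100000"), "DimValue_1.txt", false);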

View File

@@ -0,0 +1,59 @@
package com.tencent.supersonic.knowledge.domain.converter;
import com.google.common.base.Strings;
import com.tencent.supersonic.auth.api.authentication.pojo.User;
import com.tencent.supersonic.common.enums.TaskStatusEnum;
import com.tencent.supersonic.common.util.json.JsonUtil;
import com.tencent.supersonic.knowledge.domain.dataobject.DictConfPO;
import com.tencent.supersonic.knowledge.domain.dataobject.DimValueDictTaskPO;
import com.tencent.supersonic.knowledge.domain.pojo.DictConfig;
import com.tencent.supersonic.knowledge.domain.pojo.DimValue2DictCommand;
import com.tencent.supersonic.knowledge.domain.pojo.DimValueInfo;
import java.time.ZoneId;
import java.time.format.DateTimeFormatter;
import java.util.Date;
import java.util.List;
public class DictTaskConverter {
private static String dateTimeFormatter = "yyyyMMddHHmmss";
public static DimValueDictTaskPO generateDimValueDictTaskPO(DimValue2DictCommand dimValue2DictCommand, User user) {
DimValueDictTaskPO taskPO = new DimValueDictTaskPO();
Date createAt = new Date();
String date = DateTimeFormatter.ofPattern(DATE_TIME_PATTERN)
.format(createAt.toInstant().atZone(ZoneId.systemDefault()).toLocalDateTime());
String creator = Strings.isNullOrEmpty(user.getName()) ? "" : user.getName();
String updateMode = dimValue2DictCommand.getUpdateMode().getValue();
String name = String.format("DimValue_dic_%s_%s_%s", updateMode, creator, date);
taskPO.setName(name);
taskPO.setCreatedAt(createAt);
taskPO.setCommand(JsonUtil.toString(dimValue2DictCommand));
taskPO.setStatus(TaskStatusEnum.RUNNING.getCode());
taskPO.setCreatedBy(creator);
return taskPO;
}
public static DictConfPO generateDictConfPO(DictConfig dictConfig, User user) {
DictConfPO dictConfPO = new DictConfPO();
dictConfPO.setDimValueInfos(JsonUtil.toString(dictConfig.getDimValueInfoList()));
dictConfPO.setDomainId(dictConfig.getDomainId());
dictConfPO.setCreatedBy(user.getName());
dictConfPO.setUpdatedBy(user.getName());
dictConfPO.setCreatedAt(new Date());
dictConfPO.setUpdatedAt(new Date());
return dictConfPO;
}
public static DictConfig dictConfPO2Config(DictConfPO dictConfPO) {
DictConfig dictConfig = new DictConfig();
dictConfig.setDomainId(dictConfPO.getDomainId());
List<DimValueInfo> dimValueInfos = JsonUtil.toList(dictConfPO.getDimValueInfos(), DimValueInfo.class);
dictConfig.setDimValueInfoList(dimValueInfos);
return dictConfig;
}
}

View File

@@ -0,0 +1,20 @@
package com.tencent.supersonic.knowledge.domain.dataobject;
import java.util.Date;
import lombok.Data;
@Data
public class DictConfPO {
private Long id;
private Long domainId;
private String dimValueInfos;
private String createdBy;
private String updatedBy;
private Date createdAt;
private Date updatedAt;
}

View File

@@ -0,0 +1,35 @@
package com.tencent.supersonic.knowledge.domain.dataobject;
import java.util.Date;
import lombok.Data;
import lombok.ToString;
import org.apache.commons.codec.digest.DigestUtils;
@Data
@ToString
public class DimValueDictTaskPO {
private Long id;
private String name;
private String description;
private String command;
private String commandMd5;
private Integer status;
private String createdBy;
private Date createdAt;
private Double progress;
private Long elapsedMs;
public String getCommandMd5() {
return DigestUtils.md5Hex(command);
}
}

View File

@@ -0,0 +1,13 @@
package com.tencent.supersonic.knowledge.domain.pojo;
import java.util.List;
import lombok.Data;
@Data
public class DictConfig {
private Long domainId;
private List<DimValueInfo> dimValueInfoList;
}

View File

@@ -0,0 +1,15 @@
package com.tencent.supersonic.knowledge.domain.pojo;
public class DictTaskFilter {
private Long id;
private String name;
private String createdBy;
private String createdAt;
private Integer status;
}

View File

@@ -0,0 +1,30 @@
package com.tencent.supersonic.knowledge.domain.pojo;
public enum DictUpdateMode {
OFFLINE_FULL("OFFLINE_FULL"),
OFFLINE_DOMAIN("OFFLINE_DOMAIN"),
REALTIME_ADD("REALTIME_ADD"),
REALTIME_DELETE("REALTIME_DELETE"),
NOT_SUPPORT("NOT_SUPPORT");
private String value;
DictUpdateMode(String value) {
this.value = value;
}
public static DictUpdateMode of(String value) {
for (DictUpdateMode item : DictUpdateMode.values()) {
if (item.value.equalsIgnoreCase(value)) {
return item;
}
}
return DictUpdateMode.NOT_SUPPORT;
}
public String getValue() {
return value;
}
}
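
Lookup sketch: matching is case-insensitive, and unknown values fall back to NOT_SUPPORT rather than throwing:

DictUpdateMode mode = DictUpdateMode.of("realtime_add"); // REALTIME_ADD
DictUpdateMode unknown = DictUpdateMode.of("foo"); // NOT_SUPPORT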

View File

@@ -0,0 +1,17 @@
package com.tencent.supersonic.knowledge.domain.pojo;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import lombok.Data;
@Data
public class DimValue2DictCommand {
private DictUpdateMode updateMode;
private List<Long> domainIds;
private Map<Long, List<Long>> domainAndDimPair = new HashMap<>();
}

View File

@@ -0,0 +1,26 @@
package com.tencent.supersonic.knowledge.domain.pojo;
import com.tencent.supersonic.common.enums.TaskStatusEnum;
import java.util.Date;
import lombok.Data;
@Data
public class DimValueDictInfo {
private Long id;
private String name;
private String description;
private String command;
private TaskStatusEnum status;
private String createdBy;
private Date createdAt;
private Long elapsedMs;
}

View File

@@ -0,0 +1,26 @@
package com.tencent.supersonic.knowledge.domain.pojo;
import com.tencent.supersonic.common.enums.TypeEnums;
import java.util.List;
import javax.validation.constraints.NotNull;
public class DimValueInfo {
/**
* metricId, dimensionId, or domainId
*/
private Long itemId;
/**
* type (TypeEnums); temporarily only dimension-related information is supported
*/
@NotNull
private TypeEnums type = TypeEnums.DIMENSION;
private List<String> blackList;
private List<String> whiteList;
private List<String> ruleList;
private Boolean isDictInfo;
}

View File

@@ -0,0 +1,25 @@
package com.tencent.supersonic.knowledge.domain.repository;
import com.tencent.supersonic.knowledge.domain.dataobject.DictConfPO;
import com.tencent.supersonic.knowledge.domain.dataobject.DimValueDictTaskPO;
import com.tencent.supersonic.knowledge.domain.pojo.DictConfig;
import com.tencent.supersonic.knowledge.domain.pojo.DictTaskFilter;
import com.tencent.supersonic.knowledge.domain.pojo.DimValueDictInfo;
import java.util.List;
public interface DictRepository {
Long createDimValueDictTask(DimValueDictTaskPO dimValueDictTaskPO);
Boolean updateDictTaskStatus(Integer status, DimValueDictTaskPO dimValueDictTaskPO);
List<DimValueDictInfo> searchDictTaskList(DictTaskFilter filter);
Boolean createDictConf(DictConfPO dictConfPO);
Boolean editDictConf(DictConfPO dictConfPO);
Boolean upsertDictInfo(DictConfPO dictConfPO);
DictConfig getDictInfoByDomainId(Long domainId);
}

View File

@@ -0,0 +1,17 @@
package com.tencent.supersonic.knowledge.domain.service;
import com.tencent.supersonic.common.nlp.WordNature;
import java.util.List;
/**
* online knowledge service interface
*/
public interface OnlineKnowledgeService {
void updateSemanticKnowledge(List<WordNature> natures);
void reloadAllData(List<WordNature> natures);
void updateOnlineKnowledge(List<WordNature> natures);
}

View File

@@ -0,0 +1,17 @@
package com.tencent.supersonic.knowledge.infrastructure.custom;
import com.tencent.supersonic.knowledge.domain.dataobject.DictConfPO;
import org.apache.ibatis.annotations.Mapper;
@Mapper
public interface DictConfMapper {
Boolean createDictConf(DictConfPO dictConfPO);
Boolean editDictConf(DictConfPO dictConfPO);
Boolean upsertDictInfo(DictConfPO dictConfPO);
DictConfPO getDictInfoByDomainId(Long domainId);
}

View File

@@ -0,0 +1,16 @@
package com.tencent.supersonic.knowledge.infrastructure.custom;
import com.tencent.supersonic.knowledge.domain.dataobject.DimValueDictTaskPO;
import com.tencent.supersonic.knowledge.domain.pojo.DictTaskFilter;
import java.util.List;
import org.apache.ibatis.annotations.Mapper;
@Mapper
public interface DictTaskMapper {
Long createDimValueTask(DimValueDictTaskPO dimValueDictTaskPO);
Boolean updateTaskStatus(DimValueDictTaskPO dimValueDictTaskPO);
List<DimValueDictTaskPO> searchDictTaskList(DictTaskFilter filter);
}

View File

@@ -0,0 +1,35 @@
package com.tencent.supersonic.knowledge.infrastructure.nlp;
import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.dictionary.CoreDictionary;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
/**
* Dictionary Attribute Util
*/
public class DictionaryAttributeUtil {
public static CoreDictionary.Attribute getAttribute(CoreDictionary.Attribute old, CoreDictionary.Attribute add) {
Map<Nature, Integer> map = new HashMap<>();
IntStream.range(0, old.nature.length).boxed().forEach(i -> map.put(old.nature[i], old.frequency[i]));
IntStream.range(0, add.nature.length).boxed().forEach(i -> map.put(add.nature[i], add.frequency[i]));
List<Map.Entry<Nature, Integer>> list = new LinkedList<Map.Entry<Nature, Integer>>(map.entrySet());
Collections.sort(list, new Comparator<Map.Entry<Nature, Integer>>() {
public int compare(Map.Entry<Nature, Integer> o1, Map.Entry<Nature, Integer> o2) {
return o2.getValue() - o1.getValue();
}
});
CoreDictionary.Attribute attribute = new CoreDictionary.Attribute(
list.stream().map(i -> i.getKey()).collect(Collectors.toList()).toArray(new Nature[0]),
list.stream().map(i -> i.getValue()).mapToInt(Integer::intValue).toArray(),
list.stream().map(i -> i.getValue()).findFirst().get());
return attribute;
}
}
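
Merge sketch: natures from both attributes are unioned (the added frequency wins on collision) and sorted by descending frequency. Note that totalFrequency ends up as the single highest frequency, not the sum:

CoreDictionary.Attribute old = new CoreDictionary.Attribute(Nature.n, 100);
CoreDictionary.Attribute add = new CoreDictionary.Attribute(Nature.nz, 500);
CoreDictionary.Attribute merged = DictionaryAttributeUtil.getAttribute(old, add);
// merged.nature = [nz, n], merged.frequency = [500, 100], merged.totalFrequency = 500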

View File

@@ -0,0 +1,81 @@
package com.tencent.supersonic.knowledge.infrastructure.nlp;
import static com.hankcs.hanlp.HanLP.Config.CustomDictionaryPath;
import static com.tencent.supersonic.knowledge.infrastructure.nlp.HanlpHelper.FILE_SPILT;
import com.hankcs.hanlp.dictionary.DynamicCustomDictionary;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class FileHelper {
private static final Logger LOGGER = LoggerFactory.getLogger(FileHelper.class);
public static void deleteCacheFile(String[] path) throws IOException {
String customPath = getCustomPath(path);
File customFolder = new File(customPath);
File[] customSubFiles = getFileList(customFolder, ".bin");
if (customSubFiles == null) {
return;
}
for (File file : customSubFiles) {
try {
file.delete();
LOGGER.info("customPath:{},delete cache file:{}", customPath, file);
} catch (Exception e) {
LOGGER.error("delete " + file, e);
}
}
}
private static File[] getFileList(File customFolder, String suffix) {
return customFolder.listFiles(file -> !file.isDirectory()
&& file.getName().toLowerCase().endsWith(suffix));
}
}
private static String getCustomPath(String[] path) {
return path[0].substring(0, path[0].lastIndexOf(FILE_SPILT)) + FILE_SPILT;
}
/**
* Reset CustomDictionaryPath to the .txt files found next to the main dictionary.
*
* @param customDictionary the dictionary whose path is updated
*/
public static void resetCustomPath(DynamicCustomDictionary customDictionary) {
String[] path = CustomDictionaryPath;
String customPath = getCustomPath(path);
File customFolder = new File(customPath);
File[] customSubFiles = getFileList(customFolder, ".txt");
List<String> fileList = new ArrayList<>();
for (File file : customSubFiles) {
if (file.isFile()) {
fileList.add(file.getAbsolutePath());
}
}
LOGGER.info("CustomDictionaryPath:{}", fileList);
CustomDictionaryPath = fileList.toArray(new String[0]);
if (CustomDictionaryPath.length == 0) {
CustomDictionaryPath = path;
}
customDictionary.path = CustomDictionaryPath;
}
}

View File

@@ -0,0 +1,33 @@
package com.tencent.supersonic.knowledge.infrastructure.nlp;
import com.hankcs.hanlp.corpus.io.IIOAdapter;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class HadoopFileIOAdapter implements IIOAdapter {
private static final Logger LOGGER = LoggerFactory.getLogger(HadoopFileIOAdapter.class);
@Override
public InputStream open(String path) throws IOException {
LOGGER.info("open:{}", path);
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(URI.create(path), conf);
return fs.open(new Path(path));
}
@Override
public OutputStream create(String path) throws IOException {
LOGGER.info("create:{}", path);
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(URI.create(path), conf);
return fs.create(new Path(path));
}
}
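
This adapter plugs into HanLP's global IO hook, so hdfs:// dictionary paths go through the Hadoop FileSystem:

HanLP.Config.IOAdapter = new HadoopFileIOAdapter();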

View File

@@ -0,0 +1,157 @@
package com.tencent.supersonic.knowledge.infrastructure.nlp;
import static com.hankcs.hanlp.HanLP.Config.CustomDictionaryPath;
import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.dictionary.DynamicCustomDictionary;
import com.hankcs.hanlp.seg.Segment;
import com.tencent.supersonic.common.nlp.WordNature;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Arrays;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.util.ResourceUtils;
/**
* HanLP helper
*/
public class HanlpHelper {
private static final Logger LOGGER = LoggerFactory.getLogger(HanlpHelper.class);
public static final String FILE_SPILT = "/";
public static final String SPACE_SPILT = "#";
private static volatile Segment segment;
public static volatile DynamicCustomDictionary CustomDictionary;
public static final String DICT_MAIN_FILE_NAME = "CustomDictionary.txt";
public static final String DICT_CLASS = "classes";
public static final String NER = "crf";
static {
// reset hanlp config
try {
resetHanlpConfig();
} catch (FileNotFoundException e) {
LOGGER.error("resetHanlpConfig error", e);
}
}
public static Segment getSegment() {
if (segment == null) {
synchronized (HanlpHelper.class) {
if (segment == null) {
segment = HanLP.newSegment(NER)
.enableIndexMode(4)
.enableCustomDictionary(true).enableCustomDictionaryForcing(true).enableOffset(true)
.enableJapaneseNameRecognize(false).enableNameRecognize(false)
.enableAllNamedEntityRecognize(false)
.enableNumberQuantifierRecognize(false)
.enablePlaceRecognize(false)
.enableOrganizationRecognize(false).enableCustomDictionary(getDynamicCustomDictionary());
}
}
}
return segment;
}
public static DynamicCustomDictionary getDynamicCustomDictionary() {
if (CustomDictionary == null) {
synchronized (HanlpHelper.class) {
if (CustomDictionary == null) {
CustomDictionary = new MultiCustomDictionary(CustomDictionaryPath);
}
}
}
return CustomDictionary;
}
/***
* reload custom dictionary
*/
public static boolean reloadCustomDictionary() throws IOException {
LOGGER.info("reloadCustomDictionary start");
final long startTime = System.currentTimeMillis();
if (CustomDictionaryPath == null || CustomDictionaryPath.length == 0) {
return false;
}
if (HanLP.Config.IOAdapter instanceof HadoopFileIOAdapter) {
// 1.delete hdfs file
HdfsFileHelper.deleteCacheFile(CustomDictionaryPath);
// 2. scan txt files and update CustomDictionaryPath
HdfsFileHelper.resetCustomPath(getDynamicCustomDictionary());
} else {
FileHelper.deleteCacheFile(CustomDictionaryPath);
FileHelper.resetCustomPath(getDynamicCustomDictionary());
}
// 3.clear trie
Suggester.clear();
boolean reload = getDynamicCustomDictionary().reload();
LOGGER.info("reloadCustomDictionary end ,cost:{},reload:{}", System.currentTimeMillis() - startTime, reload);
return reload;
}
private static void resetHanlpConfig() throws FileNotFoundException {
if (HanLP.Config.IOAdapter instanceof HadoopFileIOAdapter) {
return;
}
String hanlpPropertiesPath = getHanlpPropertiesPath();
CustomDictionaryPath = Arrays.stream(CustomDictionaryPath).map(path -> hanlpPropertiesPath + FILE_SPILT + path)
.toArray(String[]::new);
LOGGER.info("hanlpPropertiesPath:{},CustomDictionaryPath:{}", hanlpPropertiesPath, CustomDictionaryPath);
HanLP.Config.CoreDictionaryPath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.CoreDictionaryPath;
HanLP.Config.CoreDictionaryTransformMatrixDictionaryPath = hanlpPropertiesPath + FILE_SPILT
+ HanLP.Config.CoreDictionaryTransformMatrixDictionaryPath;
HanLP.Config.BiGramDictionaryPath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.BiGramDictionaryPath;
HanLP.Config.CoreStopWordDictionaryPath =
hanlpPropertiesPath + FILE_SPILT + HanLP.Config.CoreStopWordDictionaryPath;
HanLP.Config.CoreSynonymDictionaryDictionaryPath = hanlpPropertiesPath + FILE_SPILT
+ HanLP.Config.CoreSynonymDictionaryDictionaryPath;
HanLP.Config.PersonDictionaryPath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.PersonDictionaryPath;
HanLP.Config.PersonDictionaryTrPath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.PersonDictionaryTrPath;
HanLP.Config.PinyinDictionaryPath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.PinyinDictionaryPath;
HanLP.Config.TranslatedPersonDictionaryPath = hanlpPropertiesPath + FILE_SPILT
+ HanLP.Config.TranslatedPersonDictionaryPath;
HanLP.Config.JapanesePersonDictionaryPath = hanlpPropertiesPath + FILE_SPILT
+ HanLP.Config.JapanesePersonDictionaryPath;
HanLP.Config.PlaceDictionaryPath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.PlaceDictionaryPath;
HanLP.Config.PlaceDictionaryTrPath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.PlaceDictionaryTrPath;
HanLP.Config.OrganizationDictionaryPath = hanlpPropertiesPath + FILE_SPILT
+ HanLP.Config.OrganizationDictionaryPath;
HanLP.Config.OrganizationDictionaryTrPath = hanlpPropertiesPath + FILE_SPILT
+ HanLP.Config.OrganizationDictionaryTrPath;
HanLP.Config.CharTypePath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.CharTypePath;
HanLP.Config.CharTablePath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.CharTablePath;
HanLP.Config.PartOfSpeechTagDictionary =
hanlpPropertiesPath + FILE_SPILT + HanLP.Config.PartOfSpeechTagDictionary;
HanLP.Config.WordNatureModelPath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.WordNatureModelPath;
HanLP.Config.MaxEntModelPath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.MaxEntModelPath;
HanLP.Config.NNParserModelPath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.NNParserModelPath;
HanLP.Config.PerceptronParserModelPath =
hanlpPropertiesPath + FILE_SPILT + HanLP.Config.PerceptronParserModelPath;
HanLP.Config.CRFSegmentModelPath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.CRFSegmentModelPath;
HanLP.Config.HMMSegmentModelPath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.HMMSegmentModelPath;
HanLP.Config.CRFCWSModelPath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.CRFCWSModelPath;
HanLP.Config.CRFPOSModelPath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.CRFPOSModelPath;
HanLP.Config.CRFNERModelPath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.CRFNERModelPath;
HanLP.Config.PerceptronCWSModelPath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.PerceptronCWSModelPath;
HanLP.Config.PerceptronPOSModelPath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.PerceptronPOSModelPath;
HanLP.Config.PerceptronNERModelPath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.PerceptronNERModelPath;
}
public static String getHanlpPropertiesPath() throws FileNotFoundException {
return ResourceUtils.getFile("classpath:hanlp.properties").getParent();
}
public static boolean addToCustomDictionary(WordNature wordNature) {
LOGGER.debug("wordNature:{}", wordNature);
return getDynamicCustomDictionary().insert(wordNature.getWord(), wordNature.getNatureWithFrequency());
}
}
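
Insertion sketch (the nature-with-frequency string is hypothetical; real values are produced by the WordNature classes earlier in this commit):

WordNature wordNature = new WordNature();
wordNature.setWord("销售额");
wordNature.setNatureWithFrequency("_3_12_metric 100000"); // hypothetical nature string
HanlpHelper.addToCustomDictionary(wordNature);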

View File

@@ -0,0 +1,85 @@
package com.tencent.supersonic.knowledge.infrastructure.nlp;
import static com.hankcs.hanlp.HanLP.Config.CustomDictionaryPath;
import static com.tencent.supersonic.knowledge.infrastructure.nlp.HanlpHelper.FILE_SPILT;
import com.hankcs.hanlp.dictionary.DynamicCustomDictionary;
import com.hankcs.hanlp.utility.Predefine;
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Hdfs File Helper
*/
public class HdfsFileHelper {
private static final Logger LOGGER = LoggerFactory.getLogger(HdfsFileHelper.class);
/***
* Delete the cached .bin files next to the custom dictionaries.
* @param path custom dictionary paths
* @throws IOException on HDFS errors
*/
public static void deleteCacheFile(String[] path) throws IOException {
FileSystem fs = FileSystem.get(URI.create(path[0]), new Configuration());
String cacheFilePath = path[0] + Predefine.BIN_EXT;
LOGGER.info("delete cache file:{}", cacheFilePath);
try {
fs.delete(new Path(cacheFilePath), false);
} catch (Exception e) {
LOGGER.error("delete:" + cacheFilePath, e);
}
int customBase = cacheFilePath.lastIndexOf(FILE_SPILT);
String customPath = cacheFilePath.substring(0, customBase) + FILE_SPILT + "*.bin";
List<String> fileList = getFileList(fs, new Path(customPath));
for (String file : fileList) {
try {
fs.delete(new Path(file), false);
LOGGER.info("delete cache file:{}", file);
} catch (Exception e) {
LOGGER.error("delete " + file, e);
}
}
LOGGER.info("fileList:{}", fileList);
}
/**
* Reset CustomDictionaryPath to the .txt files found on HDFS.
* @param customDictionary the dictionary whose path is updated
* @throws IOException on HDFS errors
*/
public static void resetCustomPath(DynamicCustomDictionary customDictionary) throws IOException {
String[] path = CustomDictionaryPath;
FileSystem fs = FileSystem.get(URI.create(path[0]), new Configuration());
String cacheFilePath = path[0] + Predefine.BIN_EXT;
int customBase = cacheFilePath.lastIndexOf(FILE_SPILT);
String customPath = cacheFilePath.substring(0, customBase) + FILE_SPILT + "*.txt";
LOGGER.info("customPath:{}", customPath);
List<String> fileList = getFileList(fs, new Path(customPath));
LOGGER.info("CustomDictionaryPath:{}", fileList);
CustomDictionaryPath = fileList.toArray(new String[0]);
if (CustomDictionaryPath.length == 0) {
CustomDictionaryPath = path;
}
customDictionary.path = CustomDictionaryPath;
}
public static List<String> getFileList(FileSystem fs, Path folderPath) throws IOException {
List<String> paths = new ArrayList<>();
FileStatus[] fileStatuses = fs.globStatus(folderPath);
for (FileStatus fileStatus : fileStatuses) {
paths.add(fileStatus.getPath().toString());
}
return paths;
}
}

View File

@@ -0,0 +1,368 @@
package com.tencent.supersonic.knowledge.infrastructure.nlp;
import static com.hankcs.hanlp.utility.Predefine.logger;
import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.collection.trie.DoubleArrayTrie;
import com.hankcs.hanlp.collection.trie.bintrie.BinTrie;
import com.hankcs.hanlp.corpus.io.ByteArray;
import com.hankcs.hanlp.corpus.io.IOUtil;
import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.dictionary.CoreDictionary;
import com.hankcs.hanlp.dictionary.DynamicCustomDictionary;
import com.hankcs.hanlp.dictionary.other.CharTable;
import com.hankcs.hanlp.utility.LexiconUtility;
import com.hankcs.hanlp.utility.Predefine;
import com.hankcs.hanlp.utility.TextUtility;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Arrays;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
public class MultiCustomDictionary extends DynamicCustomDictionary {
public static Boolean removeDuplicates = true;
private static boolean addToSuggesterTrie = true;
public MultiCustomDictionary() {
this(HanLP.Config.CustomDictionaryPath);
}
public MultiCustomDictionary(String... path) {
super(path);
}
public boolean load(String... path) {
this.path = path;
long start = System.currentTimeMillis();
if (!this.loadMainDictionary(path[0])) {
Predefine.logger.warning("自定义词典" + Arrays.toString(path) + "加载失败");
return false;
} else {
Predefine.logger.info(
"自定义词典加载成功:" + this.dat.size() + "个词条,耗时" + (System.currentTimeMillis() - start) + "ms");
this.path = path;
return true;
}
}
/***
* Load a dictionary file.
* @param path dictionary file path
* @param defaultNature default nature for entries that do not specify one
* @param map collector for the entries read
* @param customNatureCollector collector for custom natures encountered
* @param addToSuggesterTrie whether to also feed the Suggester trie
* @return whether loading succeeded
*/
public static boolean load(String path, Nature defaultNature, TreeMap<String, CoreDictionary.Attribute> map,
LinkedHashSet<Nature> customNatureCollector, boolean addToSuggesterTrie) {
try {
String splitter = "\\s";
if (path.endsWith(".csv")) {
splitter = ",";
}
BufferedReader br = new BufferedReader(new InputStreamReader(IOUtil.newInputStream(path), "UTF-8"));
boolean firstLine = true;
while (true) {
String[] param;
do {
String line;
if ((line = br.readLine()) == null) {
br.close();
return true;
}
if (firstLine) {
line = IOUtil.removeUTF8BOM(line);
firstLine = false;
}
param = line.split(splitter);
} while (param[0].length() == 0);
if (HanLP.Config.Normalization) {
param[0] = CharTable.convert(param[0]);
}
int natureCount = (param.length - 1) / 2;
CoreDictionary.Attribute attribute;
boolean isLetters = isLetters(param[0]);
String original = null;
String word = getWordBySpace(param[0]);
if (isLetters) {
original = word;
word = word.toLowerCase();
}
if (natureCount == 0) {
attribute = new CoreDictionary.Attribute(defaultNature);
} else {
attribute = new CoreDictionary.Attribute(natureCount);
for (int i = 0; i < natureCount; ++i) {
attribute.nature[i] = LexiconUtility.convertStringToNature(param[1 + 2 * i],
customNatureCollector);
attribute.frequency[i] = Integer.parseInt(param[2 + 2 * i]);
attribute.totalFrequency += attribute.frequency[i];
}
}
attribute.original = original;
if (removeDuplicates && map.containsKey(word)) {
// merge with the attribute already recorded for this word
attribute = DictionaryAttributeUtil.getAttribute(map.get(word), attribute);
}
map.put(word, attribute);
if (addToSuggeterTrie) {
Suggester.put(word, attribute);
}
}
} catch (Exception var12) {
logger.severe("Error reading custom dictionary " + path + "!" + var12);
return false;
}
}
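// A minimal usage sketch for the loader above (hypothetical path; natures are
// collected into the set):
// TreeMap<String, CoreDictionary.Attribute> map = new TreeMap<>();
// boolean ok = MultiCustomDictionary.load("data/dictionary/custom.txt", Nature.n,
//         map, new LinkedHashSet<>(), false);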
public boolean loadMainDictionary(String mainPath) {
return loadMainDictionary(mainPath, this.path, this.dat, true, addToSuggesterTrie);
}
/***
 * Load the main dictionary, preferring the .bin cache when it is usable.
 * @param mainPath path whose .bin cache is probed first
 * @param path all dictionary files to load on a cache miss
 * @param dat the DoubleArrayTrie to populate
 * @param isCache whether to write a .bin cache after a successful load
 * @param addToSuggestTrie whether to index words into the Suggester trie
 * @return true on success
 */
public static boolean loadMainDictionary(String mainPath, String[] path,
DoubleArrayTrie<CoreDictionary.Attribute> dat, boolean isCache, boolean addToSuggestTrie) {
Predefine.logger.info("自定义词典开始加载:" + mainPath);
if (loadDat(mainPath, dat)) {
return true;
} else {
TreeMap<String, CoreDictionary.Attribute> map = new TreeMap();
LinkedHashSet customNatureCollector = new LinkedHashSet();
try {
for (String p : path) {
Nature defaultNature = Nature.n;
File file = new File(p);
String fileName = file.getName();
// a trailing " nature" suffix in the file name overrides the default nature
int cut = fileName.lastIndexOf(' ');
if (cut > 0) {
String nature = fileName.substring(cut + 1);
p = file.getParent() + File.separator + fileName.substring(0, cut);
try {
defaultNature = LexiconUtility.convertStringToNature(nature, customNatureCollector);
} catch (Exception var16) {
Predefine.logger.severe("Bad nature suffix in dictionary path [" + p + "]!" + var16);
continue;
}
}
Predefine.logger.info("Loading custom dictionary " + p + " with default nature [" + defaultNature + "] ...");
boolean success = load(p, defaultNature, map, customNatureCollector, addToSuggestTrie);
if (!success) {
Predefine.logger.warning("失败:" + p);
}
}
if (map.size() == 0) {
Predefine.logger.warning("No entries were loaded");
// placeholder entry so the trie is never empty
map.put("未##它", null);
}
logger.info("Building DoubleArrayTrie ...");
dat.build(map);
if (addToSuggestTrie) {
// Suggester.save();
}
if (isCache) {
// cache as a .bin file so the next load is much faster
logger.info("Caching dictionary as a .bin file ...");
// write the attribute values first, then the trie
List<CoreDictionary.Attribute> attributeList = new LinkedList<CoreDictionary.Attribute>();
for (Map.Entry<String, CoreDictionary.Attribute> entry : map.entrySet()) {
attributeList.add(entry.getValue());
}
DataOutputStream out = new DataOutputStream(
new BufferedOutputStream(IOUtil.newOutputStream(mainPath + ".bin")));
// no custom natures were collected: write the full built-in nature list instead
if (customNatureCollector.isEmpty()) {
for (int i = Nature.begin.ordinal() + 1; i < Nature.values().length; ++i) {
customNatureCollector.add(Nature.values()[i]);
}
}
IOUtil.writeCustomNature(out, customNatureCollector);
out.writeInt(attributeList.size());
for (CoreDictionary.Attribute attribute : attributeList) {
attribute.save(out);
}
dat.save(out);
out.close();
}
} catch (FileNotFoundException var17) {
logger.severe("Custom dictionary " + mainPath + " does not exist!" + var17);
return false;
} catch (IOException var18) {
logger.severe("Error reading custom dictionary " + mainPath + "!" + var18);
return false;
} catch (Exception var19) {
logger.warning("Failed to cache custom dictionary " + mainPath + "!\n" + TextUtility.exceptionToString(var19));
}
return true;
}
}
public static boolean loadDat(String path, DoubleArrayTrie<CoreDictionary.Attribute> dat) {
return loadDat(path, HanLP.Config.CustomDictionaryPath, dat);
}
public static boolean loadDat(String path, String[] customDicPath, DoubleArrayTrie<CoreDictionary.Attribute> dat) {
try {
if (HanLP.Config.CustomDictionaryAutoRefreshCache && isDicNeedUpdate(path, customDicPath)) {
return false;
} else {
ByteArray byteArray = ByteArray.createByteArray(path + ".bin");
if (byteArray == null) {
return false;
} else {
int size = byteArray.nextInt();
// a negative first int marks a custom-nature section: |size| nature names
// follow, then the real entry count
if (size < 0) {
while (true) {
++size;
if (size > 0) {
size = byteArray.nextInt();
break;
}
Nature.create(byteArray.nextString());
}
}
CoreDictionary.Attribute[] attributes = new CoreDictionary.Attribute[size];
Nature[] natureIndexArray = Nature.values();
for (int i = 0; i < size; ++i) {
int currentTotalFrequency = byteArray.nextInt();
int length = byteArray.nextInt();
attributes[i] = new CoreDictionary.Attribute(length);
attributes[i].totalFrequency = currentTotalFrequency;
for (int j = 0; j < length; ++j) {
attributes[i].nature[j] = natureIndexArray[byteArray.nextInt()];
attributes[i].frequency[j] = byteArray.nextInt();
}
}
return dat.load(byteArray, attributes);
}
}
} catch (Exception var11) {
logger.warning("读取失败,问题发生在" + TextUtility.exceptionToString(var11));
return false;
}
}
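// Cache layout, as the reader above implies: an optional custom-nature section
// (a negative int whose absolute value counts the nature names that follow),
// the entry count, each attribute as totalFrequency + pair count + (nature
// ordinal, frequency) pairs, and finally the serialized DoubleArrayTrie.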
public boolean reload() {
if (this.path == null || this.path.length == 0) {
return false;
}
// deleting the cache forces loadDat to miss, so the text dictionaries are re-read
IOUtil.deleteFile(this.path[0] + ".bin");
if (loadDat(this.path[0], this.path, this.dat)) {
return true;
}
return loadMainDictionary(this.path[0], this.path, this.dat, true, addToSuggesterTrie);
}
public boolean insert(String word, String natureWithFrequency) {
if (word == null) {
return false;
} else {
if (HanLP.Config.Normalization) {
word = CharTable.convert(word);
}
CoreDictionary.Attribute att = natureWithFrequency == null ? new CoreDictionary.Attribute(Nature.nz, 1)
: CoreDictionary.Attribute.create(natureWithFrequency);
boolean isLetters = isLetters(word);
word = getWordBySpace(word);
String original = null;
if (isLetters) {
original = word;
word = word.toLowerCase();
}
if (att == null) {
return false;
} else if (this.dat.containsKey(word)) {
att.original = original;
// merge with the attribute already stored in the DoubleArrayTrie
att = DictionaryAttributeUtil.getAttribute(this.dat.get(word), att);
this.dat.set(word, att);
} else {
if (this.trie == null) {
this.trie = new BinTrie<>();
}
att.original = original;
if (this.trie.containsKey(word)) {
att = DictionaryAttributeUtil.getAttribute(this.trie.get(word), att);
}
this.trie.put(word, att);
}
if (addToSuggesterTrie) {
Suggester.put(word, att);
}
return true;
}
}
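// e.g. (hypothetical word) insert("超音数", "nz 1024 n 1") registers it with natures
// nz and n; a null natureWithFrequency falls back to nz with frequency 1.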
// true when a multi-char word contains an ASCII uppercase letter; such words are
// stored lowercased, with the original casing kept in attribute.original
public static boolean isLetters(String str) {
char[] chars = str.toCharArray();
if (chars.length <= 1) {
return false;
}
for (char c : chars) {
if (c >= 'A' && c <= 'Z') {
return true;
}
}
return false;
}
// restore spaces that dictionary entries encode with SPACE_SPILT
public static String getWordBySpace(String word) {
if (word.contains(HanlpHelper.SPACE_SPILT)) {
return word.replace(HanlpHelper.SPACE_SPILT, " ");
}
return word;
}
}

View File

@@ -0,0 +1,146 @@
package com.tencent.supersonic.knowledge.infrastructure.nlp;
import com.hankcs.hanlp.collection.trie.bintrie.BaseNode;
import com.hankcs.hanlp.collection.trie.bintrie.BinTrie;
import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.dictionary.CoreDictionary;
import com.tencent.supersonic.common.nlp.MapResult;
import com.tencent.supersonic.common.nlp.NatureType;
import com.tencent.supersonic.common.nlp.WordNature;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Service;
import org.springframework.util.CollectionUtils;
@Service
public class Suggester {
private static final Logger LOGGER = LoggerFactory.getLogger(Suggester.class);
private static BinTrie<List<String>> trie;
private static BinTrie<List<String>> suffixTrie;
private static String localFileCache = "";
public static final int SEARCH_SIZE = 200;
static {
trie = new BinTrie<>();
suffixTrie = new BinTrie<>();
}
/***
 * prefix search
 * @param key prefix to look up (lowercased internally)
 * @return matches ordered by name length, at most SEARCH_SIZE of them
 */
public static List<MapResult> prefixSearch(String key) {
return prefixSearch(key, SEARCH_SIZE, trie);
}
public static List<MapResult> prefixSearch(String key, int limit) {
return prefixSearch(key, limit, trie);
}
public static List<MapResult> prefixSearch(String key, int limit, BinTrie<List<String>> binTrie) {
Set<Map.Entry<String, List<String>>> result = prefixSearchLimit(key, limit, binTrie);
// limit only bounds the trie walk; the final result count is capped at SEARCH_SIZE
return result.stream().map(
entry -> {
String name = entry.getKey().replace("#", " ");
return new MapResult(name, entry.getValue());
}
).sorted((a, b) -> a.getName().length() - b.getName().length())
.limit(SEARCH_SIZE)
.collect(Collectors.toList());
}
/***
 * suffix search: the key is reversed and looked up in the suffix trie
 * @param key suffix to look up
 * @param limit bound on the trie walk
 * @return matches with names restored to their original orientation
 */
public static List<MapResult> suffixSearch(String key, int limit) {
String reverseDetectSegment = StringUtils.reverse(key);
return suffixSearch(reverseDetectSegment, limit, suffixTrie);
}
public static List<MapResult> suffixSearch(String key, int limit, BinTrie<List<String>> binTrie) {
Set<Map.Entry<String, List<String>>> result = prefixSearchLimit(key, limit, binTrie);
return result.stream().map(
entry -> {
String name = entry.getKey().replace("#", " ");
List<String> natures = entry.getValue().stream()
.map(nature -> nature.replaceAll(NatureType.SUFFIX.getType(), ""))
.collect(Collectors.toList());
name = StringUtils.reverse(name);
return new MapResult(name, natures);
}
).sorted((a, b) -> a.getName().length() - b.getName().length())
.limit(SEARCH_SIZE)
.collect(Collectors.toList());
}
private static Set<Map.Entry<String, List<String>>> prefixSearchLimit(String key, int limit,
BinTrie<List<String>> binTrie) {
key = key.toLowerCase();
Set<Map.Entry<String, List<String>>> entrySet = new TreeSet<>();
if (key.isEmpty()) {
return entrySet;
}
// walkLimit appends the current node's char, so seed the builder with all but the last char
StringBuilder sb = new StringBuilder(key.substring(0, key.length() - 1));
BaseNode branch = binTrie;
for (char aChar : key.toCharArray()) {
if (branch == null) {
return entrySet;
}
branch = branch.getChild(aChar);
}
if (branch == null) {
return entrySet;
}
branch.walkLimit(sb, entrySet, limit);
return entrySet;
}
public static void clear() {
LOGGER.info("clear all trie");
trie = new BinTrie<>();
suffixTrie = new BinTrie<>();
}
public static void put(String key, CoreDictionary.Attribute attribute) {
trie.put(key, Arrays.stream(attribute.nature).map(entry -> entry.toString()).collect(Collectors.toList()));
}
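// A usage sketch (hypothetical entries): words indexed through put(...) become
// prefix-searchable, e.g. put("net_profit", attr) followed by prefixSearch("net")
// yields a MapResult named "net_profit" carrying that word's nature strings.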
public static void loadSuffix(List<WordNature> suffixes) {
if (CollectionUtils.isEmpty(suffixes)) {
return;
}
TreeMap<String, CoreDictionary.Attribute> map = new TreeMap<>();
for (WordNature suffix : suffixes) {
CoreDictionary.Attribute attributeNew = suffix.getNatureWithFrequency() == null
? new CoreDictionary.Attribute(Nature.nz, 1)
: CoreDictionary.Attribute.create(suffix.getNatureWithFrequency());
if (map.containsKey(suffix.getWord())) {
attributeNew = DictionaryAttributeUtil.getAttribute(map.get(suffix.getWord()), attributeNew);
}
map.put(suffix.getWord(), attributeNew);
}
for (Map.Entry<String, CoreDictionary.Attribute> stringAttributeEntry : map.entrySet()) {
putSuffix(stringAttributeEntry.getKey(), stringAttributeEntry.getValue());
}
}
public static void putSuffix(String key, CoreDictionary.Attribute attribute) {
suffixTrie.put(key,
Arrays.stream(attribute.nature).map(entry -> entry.toString()).collect(Collectors.toList()));
}
}

View File

@@ -0,0 +1,93 @@
package com.tencent.supersonic.knowledge.infrastructure.repository;
import com.tencent.supersonic.common.enums.TaskStatusEnum;
import com.tencent.supersonic.knowledge.domain.converter.DictTaskConverter;
import com.tencent.supersonic.knowledge.domain.dataobject.DictConfPO;
import com.tencent.supersonic.knowledge.domain.dataobject.DimValueDictTaskPO;
import com.tencent.supersonic.knowledge.domain.pojo.DictConfig;
import com.tencent.supersonic.knowledge.domain.pojo.DictTaskFilter;
import com.tencent.supersonic.knowledge.domain.pojo.DimValueDictInfo;
import com.tencent.supersonic.knowledge.domain.repository.DictRepository;
import com.tencent.supersonic.knowledge.infrastructure.custom.DictConfMapper;
import com.tencent.supersonic.knowledge.infrastructure.custom.DictTaskMapper;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Objects;
import java.util.concurrent.CompletableFuture;
import org.springframework.beans.BeanUtils;
import org.springframework.stereotype.Repository;
import org.springframework.util.CollectionUtils;
@Repository
public class DictRepositoryImpl implements DictRepository {
private final DictTaskMapper dictTaskMapper;
private final DictConfMapper dictConfMapper;
public DictRepositoryImpl(DictTaskMapper dictTaskMapper,
DictConfMapper dictConfMapper) {
this.dictTaskMapper = dictTaskMapper;
this.dictConfMapper = dictConfMapper;
}
@Override
public Long createDimValueDictTask(DimValueDictTaskPO dimValueDictTaskPO) {
dictTaskMapper.createDimValueTask(dimValueDictTaskPO);
return dimValueDictTaskPO.getId();
}
@Override
public Boolean updateDictTaskStatus(Integer status, DimValueDictTaskPO dimValueDictTaskPO) {
dimValueDictTaskPO.setStatus(status);
Date createdAt = dimValueDictTaskPO.getCreatedAt();
long elapsedMs = System.currentTimeMillis() - createdAt.getTime();
dimValueDictTaskPO.setElapsedMs(elapsedMs);
// fire-and-forget: persist the status change asynchronously
CompletableFuture.runAsync(() -> dictTaskMapper.updateTaskStatus(dimValueDictTaskPO));
return true;
}
@Override
public List<DimValueDictInfo> searchDictTaskList(DictTaskFilter filter) {
List<DimValueDictInfo> dimValueDictDescList = new ArrayList<>();
List<DimValueDictTaskPO> dimValueDictTaskPOList = dictTaskMapper.searchDictTaskList(filter);
if (!CollectionUtils.isEmpty(dimValueDictTaskPOList)) {
dimValueDictTaskPOList.forEach(dictTaskPO -> {
DimValueDictInfo dimValueDictDesc = new DimValueDictInfo();
BeanUtils.copyProperties(dictTaskPO, dimValueDictDesc);
dimValueDictDesc.setStatus(TaskStatusEnum.of(dictTaskPO.getStatus()));
dimValueDictDescList.add(dimValueDictDesc);
});
}
return dimValueDictDescList;
}
@Override
public Boolean createDictConf(DictConfPO dictConfPO) {
return dictConfMapper.createDictConf(dictConfPO);
}
@Override
public Boolean editDictConf(DictConfPO dictConfPO) {
return dictConfMapper.editDictConf(dictConfPO);
}
@Override
public Boolean upsertDictInfo(DictConfPO dictConfPO) {
return dictConfMapper.upsertDictInfo(dictConfPO);
}
@Override
public DictConfig getDictInfoByDomainId(Long domainId) {
DictConfPO dictConfPO = dictConfMapper.getDictInfoByDomainId(domainId);
if (Objects.isNull(dictConfPO)) {
return null;
}
return DictTaskConverter.dictConfPO2Config(dictConfPO);
}
}

View File

@@ -0,0 +1,2 @@
com.tencent.supersonic.knowledge.domain.FileHandler=\
com.tencent.supersonic.knowledge.domain.LocalFileHandler

View File

@@ -0,0 +1,53 @@
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN"
"http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<mapper namespace="com.tencent.supersonic.knowledge.infrastructure.custom.DictConfMapper">
<resultMap id="DictConfPO"
type="com.tencent.supersonic.knowledge.domain.dataobject.DictConfPO">
<id column="id" property="id"/>
<result column="domain_id" property="domainId"/>
<result column="dim_value_infos" property="dimValueInfos"/>
<result column="created_at" property="createdAt"/>
<result column="updated_at" property="updatedAt"/>
<result column="created_by" property="createdBy"/>
<result column="updated_by" property="updatedBy"/>
</resultMap>
<insert id="createDictConf">
insert into s2_dictionary
(`domain_id`, dim_value_infos, created_at, updated_at, created_by, updated_by)
values
(#{domainId}, #{dimValueInfos}, #{createdAt}, #{updatedAt}, #{createdBy}, #{updatedBy})
</insert>
<insert id="upsertDictInfo">
insert into s2_dictionary
(`domain_id`, dim_value_infos, created_at, updated_at, created_by, updated_by)
values
(#{domainId}, #{dimValueInfos}, #{createdAt}, #{updatedAt}, #{createdBy}, #{updatedBy})
on duplicate key update
dim_value_infos = #{dimValueInfos},
updated_at = #{updatedAt},
updated_by = #{updatedBy}
</insert>
<update id="editDictConf">
update s2_dictionary
set dim_value_infos = #{dimValueInfos},
updated_at = #{updatedAt},
updated_by = #{updatedBy}
where domain_id = #{domainId}
and status = 0
</update>
<select id="getDictInfoByDomainId" resultMap="DictConfPO">
select *
from s2_dictionary
where domain_id = #{domainId}
and status = 0
</select>
</mapper>

View File

@@ -0,0 +1,71 @@
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN"
"http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<mapper namespace="com.tencent.supersonic.knowledge.infrastructure.custom.DictTaskMapper">
<resultMap id="DimValueDictTaskPO"
type="com.tencent.supersonic.knowledge.domain.dataobject.DimValueDictTaskPO">
<id column="id" property="id"/>
<result column="name" property="name"/>
<result column="description" property="description"/>
<result column="command" property="command"/>
<result column="command_md5" property="commandMd5"/>
<result column="status" property="status"/>
<result column="created_by" property="createdBy"/>
<result column="created_at" property="createdAt"/>
<result column="progress" property="progress"/>
<result column="elapsed_ms" property="elapsedMs"/>
</resultMap>
<insert id="createDimValueTask">
insert into s2_dictionary_task
(`name`, description, command, command_md5, status, created_by, progress, elapsed_ms)
values
(#{name}, #{description}, #{command}, #{commandMd5}, #{status}, #{createdBy}, #{progress}, #{elapsedMs})
</insert>
<update id="updateTaskStatus">
update s2_dictionary_task
<set>
<if test="description != null and description !=''">
description = #{description},
</if>
<if test="status != null">
status = #{status},
</if>
<if test="progress != null">
progress = #{progress},
</if>
<if test="elapsedMs != null">
elapsed_ms = #{elapsedMs},
</if>
</set>
where name = #{name}
and status = 0
</update>
<select id="searchDictTaskList" resultMap="DimValueDictTaskPO">
select *
from s2_dictionary_task
<where>
<if test="id != null and id != ''">
and id >= #{id}
</if>
<if test="name != null and name !=''">
and `name` like "%"#{name}"%"
</if>
<if test="createdBy != null and createdBy !=''">
and created_by = #{createdBy}
</if>
<if test="createdAt != null and createdAt !=''">
and created_at &gt;= #{createdAt}
</if>
<if test="status != null and status !=''">
and status = #{status}
</if>
</where>
</select>
</mapper>

View File

@@ -0,0 +1,15 @@
CREATE TABLE IF NOT EXISTS `s2_dictionary` (
`id` bigint(20) unsigned NOT NULL AUTO_INCREMENT,
`item_id` bigint(20) DEFAULT NULL COMMENT 'id of the related item (dimension, metric, etc.)',
`type` varchar(50) DEFAULT NULL COMMENT 'related item type (dimension, metric, etc.)',
`black_list` mediumtext COMMENT 'dictionary blacklist',
`white_list` mediumtext COMMENT 'dictionary whitelist',
`rule_list` mediumtext COMMENT 'dictionary rules',
`is_dict_Info` tinyint(1) NOT NULL DEFAULT '0' COMMENT '1 - write to dictionary enabled, 0 - disabled',
`created_at` datetime NOT NULL COMMENT 'creation time',
`updated_at` datetime NOT NULL COMMENT 'update time',
`created_by` varchar(100) NOT NULL COMMENT 'creator',
`updated_by` varchar(100) DEFAULT NULL COMMENT 'updater',
`is_deleted` tinyint(1) NOT NULL DEFAULT '0' COMMENT '1 - deleted, 0 - available',
PRIMARY KEY (`id`)
) COMMENT='dictionary configuration table'

View File

@@ -0,0 +1,11 @@
CREATE TABLE IF NOT EXISTS `s2_dictionary_task` (
`id` bigint(20) unsigned NOT NULL AUTO_INCREMENT,
`name` varchar(255) NOT NULL COMMENT 'task name',
`description` varchar(255) NOT NULL COMMENT 'task description',
`command` mediumtext NOT NULL COMMENT 'task request parameters',
`status` int(10) NOT NULL COMMENT 'final task status',
`created_at` datetime DEFAULT CURRENT_TIMESTAMP COMMENT 'creation time',
`created_by` varchar(100) NOT NULL COMMENT 'creator',
`elapsed_ms` bigint(10) DEFAULT NULL COMMENT 'task elapsed time (ms)',
PRIMARY KEY (`id`)
) COMMENT='dictionary task table'