mirror of https://github.com/tencentmusic/supersonic.git (synced 2025-12-11 12:07:42 +00:00)
headless integrates knowledge (#722)
@@ -190,6 +190,11 @@
            <groupId>dev.langchain4j</groupId>
            <artifactId>langchain4j-embeddings</artifactId>
        </dependency>
        <dependency>
            <groupId>com.hankcs</groupId>
            <artifactId>hanlp</artifactId>
            <version>${hanlp.version}</version>
        </dependency>

    </dependencies>
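
The ${hanlp.version} placeholder assumes a matching <properties> entry elsewhere in the POM, which this hunk does not show. A minimal sketch with an illustrative version number (not taken from this commit):

<properties>
    <!-- hypothetical value; the project defines the real one -->
    <hanlp.version>portable-1.8.4</hanlp.version>
</properties>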
63 common/src/main/java/com/hankcs/hanlp/LoadRemoveService.java Normal file
@@ -0,0 +1,63 @@
package com.hankcs.hanlp;

import com.tencent.supersonic.common.pojo.enums.DictWordType;
import lombok.Data;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.util.CollectionUtils;

import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.Set;

@Data
@Slf4j
public class LoadRemoveService {

    @Value("${mapper.remove.nature.prefix:}")
    private String mapperRemoveNaturePrefix;

    /**
     * Filters a list of nature strings: drops natures whose embedded view/model id
     * is not in detectModelIds, then drops natures matching the configured prefix.
     */
    public List removeNatures(List value, Set<Long> detectModelIds) {
        if (CollectionUtils.isEmpty(value)) {
            return value;
        }
        List<String> resultList = new ArrayList<>(value);
        if (!CollectionUtils.isEmpty(detectModelIds)) {
            resultList.removeIf(nature -> {
                if (Objects.isNull(nature)) {
                    return false;
                }
                Long modelId = getViewId(nature);
                if (Objects.nonNull(modelId)) {
                    return !detectModelIds.contains(modelId);
                }
                return false;
            });
        }
        if (StringUtils.isNotBlank(mapperRemoveNaturePrefix)) {
            resultList.removeIf(nature -> {
                if (Objects.isNull(nature)) {
                    return false;
                }
                return nature.startsWith(mapperRemoveNaturePrefix);
            });
        }
        return resultList;
    }

    /**
     * Extracts the view/model id from the segment after DictWordType.NATURE_SPILT;
     * returns null when no id segment is present or it is not numeric.
     */
    public Long getViewId(String nature) {
        try {
            String[] split = nature.split(DictWordType.NATURE_SPILT);
            if (split.length <= 1) {
                return null;
            }
            return Long.valueOf(split[1]);
        } catch (NumberFormatException e) {
            log.error("failed to parse view id from nature: " + nature, e);
        }
        return null;
    }
}
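
A minimal standalone sketch (not part of this commit) of how removeNatures filters nature strings, assuming DictWordType.NATURE_SPILT resolves to "_" (the constant is not shown in this diff). Without a Spring context the @Value prefix stays empty, so only the model-id filter runs:

import com.hankcs.hanlp.LoadRemoveService;

import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class LoadRemoveServiceDemo {
    public static void main(String[] args) {
        LoadRemoveService service = new LoadRemoveService();
        // Natures carry a model/view id after the separator, e.g. "dimension_3".
        List<String> natures = Arrays.asList("dimension_3", "metric_7", "nr");
        Set<Long> detectModelIds = new HashSet<>(Arrays.asList(3L));
        // "metric_7" is dropped (model 7 was not detected); "nr" has no id and is kept.
        List kept = service.removeNatures(natures, detectModelIds);
        System.out.println(kept); // expected: [dimension_3, nr]
    }
}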
331 common/src/main/java/com/hankcs/hanlp/collection/trie/bintrie/BaseNode.java Normal file
@@ -0,0 +1,331 @@
package com.hankcs.hanlp.collection.trie.bintrie;

import com.hankcs.hanlp.LoadRemoveService;
import com.hankcs.hanlp.corpus.io.ByteArray;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;
import java.util.AbstractMap;
import java.util.ArrayDeque;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Queue;
import java.util.Set;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public abstract class BaseNode<V> implements Comparable<BaseNode> {

    /**
     * Status array, cached for convenient lookup when loading
     */
    static final Status[] ARRAY_STATUS = Status.values();

    private static final Logger logger = LoggerFactory.getLogger(BaseNode.class);
    /**
     * Child nodes
     */
    protected BaseNode[] child;
    /**
     * Node status
     */
    protected Status status;
    /**
     * The character this node represents
     */
    protected char c;
    /**
     * The value this node holds
     */
    protected V value;

    protected String prefix = null;

    public BaseNode<V> transition(String path, int begin) {
        BaseNode<V> cur = this;
        for (int i = begin; i < path.length(); ++i) {
            cur = cur.getChild(path.charAt(i));
            if (cur == null || cur.status == Status.UNDEFINED_0) {
                return null;
            }
        }
        return cur;
    }

    public BaseNode<V> transition(char[] path, int begin) {
        BaseNode<V> cur = this;
        for (int i = begin; i < path.length; ++i) {
            cur = cur.getChild(path[i]);
            if (cur == null || cur.status == Status.UNDEFINED_0) {
                return null;
            }
        }
        return cur;
    }

    /**
     * Transition by a single character
     *
     * @param path the character to follow
     * @return the child node, or null if the transition is undefined
     */
    public BaseNode<V> transition(char path) {
        BaseNode<V> cur = this.getChild(path);
        if (cur == null || cur.status == Status.UNDEFINED_0) {
            return null;
        }
        return cur;
    }

    /**
     * Add a child node
     *
     * @return true if a new node was added, false if an existing node was modified
     */
    protected abstract boolean addChild(BaseNode node);

    /**
     * Whether this node has a child for the given character
     *
     * @param c the child's character
     * @return whether such a child exists
     */
    protected boolean hasChild(char c) {
        return getChild(c) != null;
    }

    protected char getChar() {
        return c;
    }

    /**
     * Get the child node
     *
     * @param c the child's character
     * @return the child node
     */
    public abstract BaseNode getChild(char c);

    /**
     * Get this node's value
     *
     * @return the value
     */
    public final V getValue() {
        return value;
    }

    /**
     * Set this node's value
     *
     * @param value the value
     */
    public final void setValue(V value) {
        this.value = value;
    }

    @Override
    public int compareTo(BaseNode other) {
        return compareTo(other.getChar());
    }

    /**
     * Overload: compare against a character
     *
     * @param other the character to compare with
     * @return the comparison result
     */
    public int compareTo(char other) {
        if (this.c > other) {
            return 1;
        }
        if (this.c < other) {
            return -1;
        }
        return 0;
    }

    /**
     * Get the node's word-formation status
     *
     * @return the status
     */
    public Status getStatus() {
        return status;
    }

    protected void walk(StringBuilder sb, Set<Map.Entry<String, V>> entrySet) {
        sb.append(c);
        if (status == Status.WORD_MIDDLE_2 || status == Status.WORD_END_3) {
            entrySet.add(new TrieEntry(sb.toString(), value));
        }
        if (child == null) {
            return;
        }
        for (BaseNode node : child) {
            if (node == null) {
                continue;
            }
            node.walk(new StringBuilder(sb.toString()), entrySet);
        }
    }

    protected void walkToSave(DataOutputStream out) throws IOException {
        out.writeChar(c);
        out.writeInt(status.ordinal());
        int childSize = 0;
        if (child != null) {
            childSize = child.length;
        }
        out.writeInt(childSize);
        if (child == null) {
            return;
        }
        for (BaseNode node : child) {
            node.walkToSave(out);
        }
    }

    protected void walkToSave(ObjectOutput out) throws IOException {
        out.writeChar(c);
        out.writeInt(status.ordinal());
        if (status == Status.WORD_END_3 || status == Status.WORD_MIDDLE_2) {
            out.writeObject(value);
        }
        int childSize = 0;
        if (child != null) {
            childSize = child.length;
        }
        out.writeInt(childSize);
        if (child == null) {
            return;
        }
        for (BaseNode node : child) {
            node.walkToSave(out);
        }
    }

    protected void walkToLoad(ByteArray byteArray, _ValueArray<V> valueArray) {
        c = byteArray.nextChar();
        status = ARRAY_STATUS[byteArray.nextInt()];
        if (status == Status.WORD_END_3 || status == Status.WORD_MIDDLE_2) {
            value = valueArray.nextValue();
        }
        int childSize = byteArray.nextInt();
        child = new BaseNode[childSize];
        for (int i = 0; i < childSize; ++i) {
            child[i] = new Node<V>();
            child[i].walkToLoad(byteArray, valueArray);
        }
    }

    protected void walkToLoad(ObjectInput byteArray) throws IOException, ClassNotFoundException {
        c = byteArray.readChar();
        status = ARRAY_STATUS[byteArray.readInt()];
        if (status == Status.WORD_END_3 || status == Status.WORD_MIDDLE_2) {
            value = (V) byteArray.readObject();
        }
        int childSize = byteArray.readInt();
        child = new BaseNode[childSize];
        for (int i = 0; i < childSize; ++i) {
            child[i] = new Node<V>();
            child[i].walkToLoad(byteArray);
        }
    }

    public enum Status {
        /**
         * Undefined; used for deleting entries
         */
        UNDEFINED_0,
        /**
         * Not the end of a word
         */
        NOT_WORD_1,
        /**
         * End of a word, and longer words continue through this node
         */
        WORD_MIDDLE_2,
        /**
         * End of a word, with no continuation
         */
        WORD_END_3,
    }

    public class TrieEntry extends AbstractMap.SimpleEntry<String, V> implements Comparable<TrieEntry> {

        public TrieEntry(String key, V value) {
            super(key, value);
        }

        @Override
        public int compareTo(TrieEntry o) {
            return getKey().compareTo(String.valueOf(o.getKey()));
        }
    }

    @Override
    public String toString() {
        return "BaseNode{"
                + "child="
                + Arrays.toString(child)
                + ", status="
                + status
                + ", c="
                + c
                + ", value="
                + value
                + ", prefix='"
                + prefix
                + '\''
                + '}';
    }

    /**
     * Emits this node as a trie entry if it ends a word, filtering its natures
     * through LoadRemoveService by the detected model ids.
     */
    public void walkNode(Set<Map.Entry<String, V>> entrySet, Set<Long> detectModelIds) {
        if (status == Status.WORD_MIDDLE_2 || status == Status.WORD_END_3) {
            logger.debug("detectModelIds:{},before:{}", detectModelIds, value.toString());
            List natures = new LoadRemoveService().removeNatures((List) value, detectModelIds);
            String name = this.prefix != null ? this.prefix + c : "" + c;
            logger.debug("name:{},after:{},natures:{}", name, (List) value, natures);
            entrySet.add(new TrieEntry(name, (V) natures));
        }
    }

    /**
     * Breadth-first walk that collects at most limit entries, propagating each
     * node's string prefix down to its children.
     *
     * @param sb prefix accumulated so far
     * @param entrySet output set of collected entries
     * @param limit maximum number of entries to collect
     * @param detectModelIds model ids used to filter natures
     */
    public void walkLimit(StringBuilder sb, Set<Map.Entry<String, V>> entrySet, int limit, Set<Long> detectModelIds) {
        Queue<BaseNode> queue = new ArrayDeque<>();
        this.prefix = sb.toString();
        queue.add(this);
        while (!queue.isEmpty()) {
            if (entrySet.size() >= limit) {
                break;
            }
            BaseNode root = queue.poll();
            if (root == null) {
                continue;
            }
            root.walkNode(entrySet, detectModelIds);
            if (root.child == null) {
                continue;
            }
            String prefix = root.prefix + root.c;
            for (BaseNode node : root.child) {
                if (Objects.nonNull(node)) {
                    node.prefix = prefix;
                    queue.add(node);
                }
            }
        }
    }
}
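
To clarify the breadth-first walk that walkLimit implements, here is a standalone toy (not HanLP code) with the same shape: store each node's prefix before enqueueing its children, emit prefix plus character for word-ending nodes, and stop once the limit is reached. The toy skips the dummy root's character, a simplification of its own:

import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Queue;

public class WalkLimitToy {

    static class Node {
        char c;
        boolean isWord;
        String prefix;                                   // set by the parent during the walk
        Map<Character, Node> children = new HashMap<>();
    }

    static void put(Node root, String word) {
        Node cur = root;
        for (char ch : word.toCharArray()) {
            cur = cur.children.computeIfAbsent(ch, k -> {
                Node n = new Node();
                n.c = k;
                return n;
            });
        }
        cur.isWord = true;
    }

    static List<String> walkLimit(Node root, int limit) {
        List<String> out = new ArrayList<>();
        root.prefix = "";
        Queue<Node> queue = new ArrayDeque<>();
        queue.add(root);
        while (!queue.isEmpty()) {
            if (out.size() >= limit) {
                break;                                   // same early exit as above
            }
            Node node = queue.poll();
            if (node.isWord) {
                out.add(node.prefix + node.c);           // same naming rule as walkNode
            }
            String childPrefix = (node == root) ? "" : node.prefix + node.c;
            for (Node child : node.children.values()) {
                child.prefix = childPrefix;              // propagate prefix, as walkLimit does
                queue.add(child);
            }
        }
        return out;
    }

    public static void main(String[] args) {
        Node root = new Node();
        put(root, "so");
        put(root, "son");
        put(root, "song");
        System.out.println(walkLimit(root, 2));          // [so, son]: breadth-first, shortest first
    }
}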
393 common/src/main/java/com/hankcs/hanlp/dictionary/CoreDictionary.java Normal file
@@ -0,0 +1,393 @@
package com.hankcs.hanlp.dictionary;

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.collection.trie.DoubleArrayTrie;
import com.hankcs.hanlp.corpus.io.ByteArray;
import com.hankcs.hanlp.corpus.io.IOUtil;
import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.utility.Predefine;
import com.hankcs.hanlp.utility.TextUtility;

import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.DataOutputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.util.Collection;
import java.util.TreeMap;

/**
 * Core dictionary implemented with a DoubleArrayTrie
 */
public class CoreDictionary {

    public static DoubleArrayTrie<Attribute> trie = new DoubleArrayTrie<Attribute>();

    public static final String PATH = HanLP.Config.CoreDictionaryPath;

    // Load the dictionary automatically on class initialization
    static {
        long start = System.currentTimeMillis();
        if (!load(PATH)) {
            throw new IllegalArgumentException("核心词典" + PATH + "加载失败");
        } else {
            Predefine.logger.info(PATH + "加载成功," + trie.size() + "个词条,耗时"
                    + (System.currentTimeMillis() - start) + "ms");
        }
    }

    // Some special WORD_IDs
    public static final int NR_WORD_ID = getWordID(Predefine.TAG_PEOPLE);
    public static final int NS_WORD_ID = getWordID(Predefine.TAG_PLACE);
    public static final int NT_WORD_ID = getWordID(Predefine.TAG_GROUP);
    public static final int T_WORD_ID = getWordID(Predefine.TAG_TIME);
    public static final int X_WORD_ID = getWordID(Predefine.TAG_CLUSTER);
    public static final int M_WORD_ID = getWordID(Predefine.TAG_NUMBER);
    public static final int NX_WORD_ID = getWordID(Predefine.TAG_PROPER);

    private static boolean load(String path) {
        Predefine.logger.info("核心词典开始加载:" + path);
        if (loadDat(path)) {
            return true;
        }
        // No binary cache yet: parse the text dictionary, then write the cache.
        TreeMap<String, Attribute> map = new TreeMap<String, Attribute>();
        BufferedReader br = null;
        try {
            br = new BufferedReader(new InputStreamReader(IOUtil.newInputStream(path), "UTF-8"));
            String line;
            int totalFrequency = 0;
            long start = System.currentTimeMillis();
            while ((line = br.readLine()) != null) {
                // Each line: word followed by (nature, frequency) pairs.
                String[] param = line.split("\\s");
                int natureCount = (param.length - 1) / 2;
                Attribute attribute = new Attribute(natureCount);
                for (int i = 0; i < natureCount; ++i) {
                    attribute.nature[i] = Nature.create(param[1 + 2 * i]);
                    attribute.frequency[i] = Integer.parseInt(param[2 + 2 * i]);
                    attribute.totalFrequency += attribute.frequency[i];
                }
                map.put(param[0], attribute);
                totalFrequency += attribute.totalFrequency;
            }
            Predefine.logger.info(
                    "核心词典读入词条" + map.size() + " 全部频次" + totalFrequency + ",耗时" + (System.currentTimeMillis() - start)
                            + "ms");
            br.close();
            trie.build(map);
            Predefine.logger.info("核心词典加载成功:" + trie.size() + "个词条,下面将写入缓存……");
            try {
                DataOutputStream out = new DataOutputStream(
                        new BufferedOutputStream(IOUtil.newOutputStream(path + Predefine.BIN_EXT)));
                Collection<Attribute> attributeList = map.values();
                out.writeInt(attributeList.size());
                for (Attribute attribute : attributeList) {
                    out.writeInt(attribute.totalFrequency);
                    out.writeInt(attribute.nature.length);
                    for (int i = 0; i < attribute.nature.length; ++i) {
                        out.writeInt(attribute.nature[i].ordinal());
                        out.writeInt(attribute.frequency[i]);
                    }
                }
                trie.save(out);
                out.writeInt(totalFrequency);
                Predefine.setTotalFrequency(totalFrequency);
                out.close();
            } catch (Exception e) {
                Predefine.logger.warning("保存失败" + e);
                return false;
            }
        } catch (FileNotFoundException e) {
            Predefine.logger.warning("核心词典" + path + "不存在!" + e);
            return false;
        } catch (IOException e) {
            Predefine.logger.warning("核心词典" + path + "读取错误!" + e);
            return false;
        }

        return true;
    }

    /**
     * Load the double array from the binary cache on disk
     *
     * @param path dictionary path (the cache file adds Predefine.BIN_EXT)
     * @return whether loading succeeded
     */
    static boolean loadDat(String path) {
        try {
            ByteArray byteArray = ByteArray.createByteArray(path + Predefine.BIN_EXT);
            if (byteArray == null) {
                return false;
            }
            int size = byteArray.nextInt();
            Attribute[] attributes = new Attribute[size];
            final Nature[] natureIndexArray = Nature.values();
            for (int i = 0; i < size; ++i) {
                // First int: total frequency; second: number of natures
                int currentTotalFrequency = byteArray.nextInt();
                int length = byteArray.nextInt();
                attributes[i] = new Attribute(length);
                attributes[i].totalFrequency = currentTotalFrequency;
                for (int j = 0; j < length; ++j) {
                    attributes[i].nature[j] = natureIndexArray[byteArray.nextInt()];
                    attributes[i].frequency[j] = byteArray.nextInt();
                }
            }
            if (!trie.load(byteArray, attributes)) {
                return false;
            }
            int totalFrequency = 0;
            if (byteArray.hasMore()) {
                totalFrequency = byteArray.nextInt();
            } else {
                for (Attribute attribute : attributes) {
                    totalFrequency += attribute.totalFrequency;
                }
            }
            Predefine.setTotalFrequency(totalFrequency);
        } catch (Exception e) {
            Predefine.logger.warning("读取失败,问题发生在" + e);
            return false;
        }
        return true;
    }

    /**
     * Get an entry
     *
     * @param key the word
     * @return its attribute, or null if absent
     */
    public static Attribute get(String key) {
        return trie.get(key);
    }

    /**
     * Get an entry
     *
     * @param wordID the word's ID
     * @return its attribute
     */
    public static Attribute get(int wordID) {
        return trie.get(wordID);
    }

    /**
     * Get a term's total frequency
     *
     * @param term the word
     * @return its total frequency, or 0 if absent
     */
    public static int getTermFrequency(String term) {
        Attribute attribute = get(term);
        if (attribute == null) {
            return 0;
        }
        return attribute.totalFrequency;
    }

    /**
     * Whether the dictionary contains the word
     *
     * @param key the word
     * @return whether it is present
     */
    public static boolean contains(String key) {
        return trie.get(key) != null;
    }

    /**
     * Attributes of a word in the core dictionary
     */
    public static class Attribute implements Serializable {

        /**
         * List of natures (parts of speech)
         */
        public Nature[] nature;
        /**
         * Frequency of each nature
         */
        public int[] frequency;

        public int totalFrequency;
        public String original = null;

        public Attribute(int size) {
            nature = new Nature[size];
            frequency = new int[size];
        }

        public Attribute(Nature[] nature, int[] frequency) {
            this.nature = nature;
            this.frequency = frequency;
        }

        public Attribute(Nature nature, int frequency) {
            this(1);
            this.nature[0] = nature;
            this.frequency[0] = frequency;
            totalFrequency = frequency;
        }

        public Attribute(Nature[] nature, int[] frequency, int totalFrequency) {
            this.nature = nature;
            this.frequency = frequency;
            this.totalFrequency = totalFrequency;
        }

        /**
         * Construct with a single nature and a default frequency of 1000
         *
         * @param nature the nature
         */
        public Attribute(Nature nature) {
            this(nature, 1000);
        }

        public static Attribute create(String natureWithFrequency) {
            try {
                String[] param = natureWithFrequency.split(" ");
                if (param.length % 2 != 0) {
                    // child lock: tolerate a malformed entry
                    return new Attribute(Nature.create(natureWithFrequency.trim()), 1);
                }
                int natureCount = param.length / 2;
                Attribute attribute = new Attribute(natureCount);
                for (int i = 0; i < natureCount; ++i) {
                    attribute.nature[i] = Nature.create(param[2 * i]);
                    attribute.frequency[i] = Integer.parseInt(param[1 + 2 * i]);
                    attribute.totalFrequency += attribute.frequency[i];
                }
                return attribute;
            } catch (Exception e) {
                Predefine.logger.warning("使用字符串" + natureWithFrequency + "创建词条属性失败!"
                        + TextUtility.exceptionToString(e));
                return null;
            }
        }

        /**
         * Load from a byte stream
         *
         * @param byteArray source bytes
         * @param natureIndexArray nature lookup table indexed by ordinal
         * @return the loaded attribute
         */
        public static Attribute create(ByteArray byteArray, Nature[] natureIndexArray) {
            int currentTotalFrequency = byteArray.nextInt();
            int length = byteArray.nextInt();
            Attribute attribute = new Attribute(length);
            attribute.totalFrequency = currentTotalFrequency;
            for (int j = 0; j < length; ++j) {
                attribute.nature[j] = natureIndexArray[byteArray.nextInt()];
                attribute.frequency[j] = byteArray.nextInt();
            }

            return attribute;
        }

        /**
         * Get the frequency of a nature
         *
         * @param nature the nature as a string
         * @return its frequency
         * @deprecated prefer the overload taking a Nature!
         */
        public int getNatureFrequency(String nature) {
            try {
                Nature pos = Nature.create(nature);
                return getNatureFrequency(pos);
            } catch (IllegalArgumentException e) {
                return 0;
            }
        }

        /**
         * Get the frequency of a nature
         *
         * @param nature the nature
         * @return its frequency
         */
        public int getNatureFrequency(final Nature nature) {
            int i = 0;
            for (Nature pos : this.nature) {
                if (nature == pos) {
                    return frequency[i];
                }
                ++i;
            }
            return 0;
        }

        /**
         * Whether the word has a given nature
         *
         * @param nature the nature
         * @return whether it is present
         */
        public boolean hasNature(Nature nature) {
            return getNatureFrequency(nature) > 0;
        }

        /**
         * Whether any nature starts with the given prefix
         *
         * @param prefix nature prefix; e.g. "u" matches ude, uzhe, and so on
         * @return whether such a nature exists
         */
        public boolean hasNatureStartsWith(String prefix) {
            for (Nature n : nature) {
                if (n.startsWith(prefix)) {
                    return true;
                }
            }
            return false;
        }

        @Override
        public String toString() {
            final StringBuilder sb = new StringBuilder();
            for (int i = 0; i < nature.length; ++i) {
                sb.append(nature[i]).append(' ').append(frequency[i]).append(' ');
            }
            return sb.toString();
        }

        public void save(DataOutputStream out) throws IOException {
            out.writeInt(totalFrequency);
            out.writeInt(nature.length);
            for (int i = 0; i < nature.length; ++i) {
                out.writeInt(nature[i].ordinal());
                out.writeInt(frequency[i]);
            }
        }
    }

    /**
     * Get the word's ID
     *
     * @param a the word
     * @return its ID, or -1 if absent
     */
    public static int getWordID(String a) {
        return CoreDictionary.trie.exactMatchSearch(a);
    }

    /**
     * Hot-reload the core dictionary.
     * Cluster environments (or other IOAdapters) must delete the cache file themselves.
     *
     * @return whether the reload succeeded
     */
    public static boolean reload() {
        String path = CoreDictionary.PATH;
        IOUtil.deleteFile(path + Predefine.BIN_EXT);

        return load(path);
    }
}
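
A small sketch (not part of this commit) of the "nature frequency" pair format that Attribute.create parses. Attribute is a static nested class, so this should not trigger CoreDictionary's dictionary-loading static initializer; it only assumes HanLP's Nature tag class on the classpath:

import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.dictionary.CoreDictionary;

public class AttributeDemo {
    public static void main(String[] args) {
        // "n 1000 v 10": noun with frequency 1000, verb with frequency 10.
        CoreDictionary.Attribute attr = CoreDictionary.Attribute.create("n 1000 v 10");
        System.out.println(attr.hasNature(Nature.n));          // true
        System.out.println(attr.getNatureFrequency(Nature.v)); // 10
        System.out.println(attr.totalFrequency);               // 1010
    }
}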
342 common/src/main/java/com/hankcs/hanlp/seg/WordBasedSegment.java Normal file
@@ -0,0 +1,342 @@
package com.hankcs.hanlp.seg;

import com.hankcs.hanlp.algorithm.Viterbi;
import com.hankcs.hanlp.collection.AhoCorasick.AhoCorasickDoubleArrayTrie;
import com.hankcs.hanlp.collection.trie.DoubleArrayTrie;
import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.dictionary.CoreDictionary;
import com.hankcs.hanlp.seg.common.Term;
import com.hankcs.hanlp.dictionary.CoreDictionaryTransformMatrixDictionary;
import com.hankcs.hanlp.dictionary.other.CharType;
import com.hankcs.hanlp.seg.NShort.Path.AtomNode;
import com.hankcs.hanlp.seg.common.Graph;
import com.hankcs.hanlp.seg.common.Vertex;
import com.hankcs.hanlp.seg.common.WordNet;
import com.hankcs.hanlp.utility.TextUtility;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.ListIterator;

public abstract class WordBasedSegment extends Segment {

    public WordBasedSegment() {
    }

    /**
     * Applies rule-based fixes to the segmentation result and merges it into the
     * optimum word net.
     */
    protected static void generateWord(List<Vertex> linkedArray, WordNet wordNetOptimum) {
        fixResultByRule(linkedArray);
        wordNetOptimum.addAll(linkedArray);
    }

    /**
     * Rule-based post-processing: merge consecutive numbers, tag delimiters,
     * split digital words around '-', and merge date/time elements.
     */
    protected static void fixResultByRule(List<Vertex> linkedArray) {
        mergeContinueNumIntoOne(linkedArray);
        changeDelimiterPOS(linkedArray);
        splitMiddleSlashFromDigitalWords(linkedArray);
        checkDateElements(linkedArray);
    }

    // Tag "-", "--" and "—" vertices as punctuation (nature w).
    static void changeDelimiterPOS(List<Vertex> linkedArray) {
        for (Vertex vertex : linkedArray) {
            if (vertex.realWord.equals("--") || vertex.realWord.equals("—") || vertex.realWord.equals("-")) {
                vertex.confirmNature(Nature.w);
            }
        }
    }

    private static void splitMiddleSlashFromDigitalWords(List<Vertex> linkedArray) {
        if (linkedArray.size() >= 2) {
            ListIterator<Vertex> listIterator = linkedArray.listIterator();
            Vertex next = listIterator.next();

            for (Vertex current = next; listIterator.hasNext(); current = next) {
                next = listIterator.next();
                Nature currentNature = current.getNature();
                if (currentNature == Nature.nx && (next.hasNature(Nature.q) || next.hasNature(Nature.n))) {
                    // Note: split with a limit of 1 never actually splits, so as
                    // written the branch below is effectively inert.
                    String[] param = current.realWord.split("-", 1);
                    if (param.length == 2 && TextUtility.isAllNum(param[0]) && TextUtility.isAllNum(param[1])) {
                        current = current.copy();
                        current.realWord = param[0];
                        current.confirmNature(Nature.m);
                        listIterator.previous();
                        listIterator.previous();
                        listIterator.set(current);
                        listIterator.next();
                        listIterator.add(Vertex.newPunctuationInstance("-"));
                        listIterator.add(Vertex.newNumberInstance(param[1]));
                    }
                }
            }
        }
    }

    private static void checkDateElements(List<Vertex> linkedArray) {
        if (linkedArray.size() >= 2) {
            ListIterator<Vertex> listIterator = linkedArray.listIterator();
            Vertex next = listIterator.next();

            for (Vertex current = next; listIterator.hasNext(); current = next) {
                next = listIterator.next();
                if (TextUtility.isAllNum(current.realWord) || TextUtility.isAllChineseNum(current.realWord)) {
                    String nextWord = next.realWord;
                    if (nextWord.length() == 1 && "月日时分秒".contains(nextWord)
                            || nextWord.length() == 2 && nextWord.equals("月份")) {
                        mergeDate(listIterator, next, current);
                    } else if (nextWord.equals("年")) {
                        if (TextUtility.isYearTime(current.realWord)) {
                            mergeDate(listIterator, next, current);
                        } else {
                            current.confirmNature(Nature.m);
                        }
                    } else if (current.realWord.endsWith("点")) {
                        current.confirmNature(Nature.t, true);
                    } else {
                        char[] tmpCharArray = current.realWord.toCharArray();
                        String lastChar = String.valueOf(tmpCharArray[tmpCharArray.length - 1]);
                        if (!"∶·././".contains(lastChar)) {
                            current.confirmNature(Nature.m, true);
                        } else if (current.realWord.length() > 1) {
                            char last = current.realWord.charAt(current.realWord.length() - 1);
                            current = Vertex.newNumberInstance(
                                    current.realWord.substring(0, current.realWord.length() - 1));
                            listIterator.previous();
                            listIterator.previous();
                            listIterator.set(current);
                            listIterator.next();
                            listIterator.add(Vertex.newPunctuationInstance(String.valueOf(last)));
                        }
                    }
                }
            }
        }
    }

    // Merge a number vertex and the following date/time unit into one time vertex.
    private static void mergeDate(ListIterator<Vertex> listIterator, Vertex next, Vertex current) {
        current = Vertex.newTimeInstance(current.realWord + next.realWord);
        listIterator.previous();
        listIterator.previous();
        listIterator.set(current);
        listIterator.next();
        listIterator.next();
        listIterator.remove();
    }

    protected static List<Term> convert(List<Vertex> vertexList) {
        return Segment.convert(vertexList, false);
    }

    protected static Graph generateBiGraph(WordNet wordNet) {
        return wordNet.toGraph();
    }

    /**
     * @deprecated
     */
    private static List<AtomNode> atomSegment(String sSentence, int start, int end) {
        if (end < start) {
            throw new RuntimeException("start=" + start + " < end=" + end);
        }
        List<AtomNode> atomSegment = new ArrayList<>();
        int pCur = 0;
        StringBuilder sb = new StringBuilder();
        char[] charArray = sSentence.substring(start, end).toCharArray();
        // CharType codes follow HanLP's Predefine constants,
        // e.g. 5=CT_SINGLE, 8=CT_LETTER, 9=CT_NUM.
        int[] charTypeArray = new int[charArray.length];

        for (int i = 0; i < charArray.length; ++i) {
            char c = charArray[i];
            charTypeArray[i] = CharType.get(c);
            if (c == '.' && i < charArray.length - 1 && CharType.get(charArray[i + 1]) == 9) {
                charTypeArray[i] = 9; // '.' before a numeric-typed char counts as numeric
            } else if (c == '.' && i < charArray.length - 1 && charArray[i + 1] >= '0' && charArray[i + 1] <= '9') {
                charTypeArray[i] = 5;
            } else if (charTypeArray[i] == 8) {
                charTypeArray[i] = 5; // treat letters as single-byte chars
            }
        }

        while (pCur < charArray.length) {
            int nCurType = charTypeArray[pCur];
            if (nCurType == 7 || nCurType == 10 || nCurType == 6 || nCurType == 17) {
                // Chinese, index, delimiter or other: emit a single-char node
                String single = String.valueOf(charArray[pCur]);
                if (single.length() != 0) {
                    atomSegment.add(new AtomNode(single, nCurType));
                }
                ++pCur;
            } else if (pCur < charArray.length - 1 && (nCurType == 5 || nCurType == 9)) {
                // Run of same-typed single-byte or numeric chars: merge into one node
                sb.delete(0, sb.length());
                sb.append(charArray[pCur]);
                boolean reachEnd = true;

                while (pCur < charArray.length - 1) {
                    ++pCur;
                    int nNextType = charTypeArray[pCur];
                    if (nNextType != nCurType) {
                        reachEnd = false;
                        break;
                    }
                    sb.append(charArray[pCur]);
                }

                atomSegment.add(new AtomNode(sb.toString(), nCurType));
                if (reachEnd) {
                    ++pCur;
                }
            } else {
                atomSegment.add(new AtomNode(charArray[pCur], nCurType));
                ++pCur;
            }
        }

        return atomSegment;
    }

    private static void mergeContinueNumIntoOne(List<Vertex> linkedArray) {
        if (linkedArray.size() >= 2) {
            ListIterator<Vertex> listIterator = linkedArray.listIterator();
            Vertex next = listIterator.next();
            Vertex current = next;

            while (listIterator.hasNext()) {
                next = listIterator.next();
                if (!TextUtility.isAllNum(current.realWord) && !TextUtility.isAllChineseNum(current.realWord)
                        || !TextUtility.isAllNum(next.realWord) && !TextUtility.isAllChineseNum(next.realWord)) {
                    current = next;
                } else {
                    // Both are numbers: merge into one number vertex in place.
                    current = Vertex.newNumberInstance(current.realWord + next.realWord);
                    listIterator.previous();
                    listIterator.previous();
                    listIterator.set(current);
                    listIterator.next();
                    listIterator.next();
                    listIterator.remove();
                }
            }
        }
    }

    protected void generateWordNet(final WordNet wordNetStorage) {
        final char[] charArray = wordNetStorage.charArray;
        DoubleArrayTrie.Searcher searcher = CoreDictionary.trie.getSearcher(charArray, 0);

        while (searcher.next()) {
            wordNetStorage.add(searcher.begin + 1, new Vertex(new String(charArray, searcher.begin, searcher.length),
                    (CoreDictionary.Attribute) searcher.value, searcher.index));
        }

        if (this.config.forceCustomDictionary) {
            this.customDictionary.parseText(charArray, new AhoCorasickDoubleArrayTrie.IHit<CoreDictionary.Attribute>() {
                public void hit(int begin, int end, CoreDictionary.Attribute value) {
                    wordNetStorage.add(begin + 1, new Vertex(new String(charArray, begin, end - begin), value));
                }
            });
        }

        // Fill gaps that no dictionary word covers with quick atom segmentation.
        LinkedList<Vertex>[] vertexes = wordNetStorage.getVertexes();
        int i = 1;
        while (i < vertexes.length) {
            if (vertexes[i].isEmpty()) {
                int j;
                for (j = i + 1;
                        j < vertexes.length - 1 && (vertexes[j].isEmpty() || CharType.get(charArray[j - 1]) == 11);
                        ++j) {
                }

                wordNetStorage.add(i, Segment.quickAtomSegment(charArray, i - 1, j - 1));
                i = j;
            } else {
                i += vertexes[i].getLast().realWord.length();
            }
        }
    }

    protected List<Term> decorateResultForIndexMode(List<Vertex> vertexList, WordNet wordNetAll) {
        List<Term> termList = new LinkedList<>();
        int line = 1;
        ListIterator<Vertex> listIterator = vertexList.listIterator();
        listIterator.next();
        int length = vertexList.size() - 2;

        for (int i = 0; i < length; ++i) {
            Vertex vertex = listIterator.next();
            Term termMain = Segment.convert(vertex);
            //termList.add(termMain);
            addTerms(termList, vertex, line - 1);
            termMain.offset = line - 1;
            if (vertex.realWord.length() > 2) {
                // In index mode, also emit the smaller words nested inside long words.
                label43:
                for (int currentLine = line; currentLine < line + vertex.realWord.length(); ++currentLine) {
                    Iterator iterator = wordNetAll.descendingIterator(currentLine);

                    while (true) {
                        Vertex smallVertex;
                        do {
                            if (!iterator.hasNext()) {
                                continue label43;
                            }
                            smallVertex = (Vertex) iterator.next();
                        } while ((termMain.nature != Nature.mq || !smallVertex.hasNature(Nature.q))
                                && smallVertex.realWord.length() < this.config.indexMode);

                        if (smallVertex != vertex
                                && currentLine + smallVertex.realWord.length() <= line + vertex.realWord.length()) {
                            listIterator.add(smallVertex);
                            //Term termSub = convert(smallVertex);
                            //termSub.offset = currentLine - 1;
                            //termList.add(termSub);
                            addTerms(termList, smallVertex, currentLine - 1);
                        }
                    }
                }
            }

            line += vertex.realWord.length();
        }

        return termList;
    }

    // Run Viterbi POS tagging over the vertex list using the core transform matrix.
    protected static void speechTagging(List<Vertex> vertexList) {
        Viterbi.compute(vertexList, CoreDictionaryTransformMatrixDictionary.transformMatrixDictionary);
    }

    // Emit one Term per candidate nature of the vertex, carrying that nature's frequency.
    protected void addTerms(List<Term> terms, Vertex vertex, int offset) {
        for (int i = 0; i < vertex.attribute.nature.length; i++) {
            Term term = new Term(vertex.realWord, vertex.attribute.nature[i]);
            term.setFrequency(vertex.attribute.frequency[i]);
            term.offset = offset;
            terms.add(term);
        }
    }
}
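
mergeDate and mergeContinueNumIntoOne both rely on the same ListIterator idiom: step back twice, overwrite the first element, step forward past it, then remove the absorbed second element. A standalone sketch of that idiom on plain strings (the real code also checks that the left token is numeric):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.ListIterator;

public class ListMergeDemo {
    public static void main(String[] args) {
        List<String> tokens = new ArrayList<>(Arrays.asList("2024", "年", "底"));
        ListIterator<String> it = tokens.listIterator();
        String current = it.next();
        while (it.hasNext()) {
            String next = it.next();
            if (next.length() == 1 && "年月日时分秒".contains(next)) {
                current = current + next;   // merge number + date unit
                it.previous();              // back past next
                it.previous();              // back past current
                it.set(current);            // overwrite the current slot
                it.next();                  // step over the merged token
                it.next();                  // step onto the absorbed token
                it.remove();                // and drop it
            } else {
                current = next;
            }
        }
        System.out.println(tokens); // [2024年, 底]
    }
}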
76 common/src/main/java/com/hankcs/hanlp/seg/common/Term.java Normal file
@@ -0,0 +1,76 @@
package com.hankcs.hanlp.seg.common;

import com.hankcs.hanlp.corpus.tag.Nature;
//import com.hankcs.hanlp.dictionary.CoreDictionary;
//import com.hankcs.hanlp.dictionary.CustomDictionary;
//import com.hankcs.hanlp.dictionary.DynamicCustomDictionary;
import lombok.Data;
import lombok.ToString;

//import static com.hankcs.hanlp.HanLP.Config.CustomDictionaryPath;

@Data
@ToString
public class Term {

    public String word;

    public Nature nature;
    public int offset;
    public int frequency = 0;

    public Term(String word, Nature nature) {
        this.word = word;
        this.nature = nature;
    }

    public Term(String word, Nature nature, int offset) {
        this.word = word;
        this.nature = nature;
        this.offset = offset;
    }

    public Term(String word, Nature nature, int offset, int frequency) {
        this.word = word;
        this.nature = nature;
        this.offset = offset;
        this.frequency = frequency;
    }

    public int length() {
        return this.word.length();
    }

    public int getFrequency() {
        if (frequency > 0) {
            return frequency;
        }
        // todo opt
        /*
        String wordOri = word.toLowerCase();
        CoreDictionary.Attribute attribute = getDynamicCustomDictionary().get(wordOri);
        if (attribute == null) {
            attribute = CoreDictionary.get(wordOri);
            if (attribute == null) {
                attribute = CustomDictionary.get(wordOri);
            }
        }
        if (attribute != null && nature != null && attribute.hasNature(nature)) {
            return attribute.getNatureFrequency(nature);
        }
        return attribute == null ? 0 : attribute.totalFrequency;
        */
        return 0;
    }

    // Note: equals() is overridden without hashCode(); Lombok's @Data skips
    // generating the pair when either is already declared.
    @Override
    public boolean equals(Object obj) {
        if (obj instanceof Term) {
            Term term = (Term) obj;
            if (this.nature == term.nature && this.word.equals(term.word)) {
                return true;
            }
        }
        return super.equals(obj);
    }
}
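
A minimal usage sketch (not part of this commit) of the Term value class, assuming HanLP's Nature class is on the classpath:

import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.seg.common.Term;

public class TermDemo {
    public static void main(String[] args) {
        Term term = new Term("周杰伦", Nature.nr, 0, 100);
        System.out.println(term.length());        // 3
        System.out.println(term.getFrequency());  // 100: an explicit frequency wins
        // equals() matches on word + nature and ignores offset:
        System.out.println(term.equals(new Term("周杰伦", Nature.nr, 5))); // true
    }
}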