headless integrates knowledge (#722)

This commit is contained in:
daikon
2024-02-05 20:30:57 +08:00
committed by GitHub
parent 74d0ec2b23
commit 9600456bae
174 changed files with 1908 additions and 1817 deletions

View File

@@ -190,6 +190,11 @@
<groupId>dev.langchain4j</groupId>
<artifactId>langchain4j-embeddings</artifactId>
</dependency>
<dependency>
<groupId>com.hankcs</groupId>
<artifactId>hanlp</artifactId>
<version>${hanlp.version}</version>
</dependency>
</dependencies>
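Note: ${hanlp.version} must resolve to a property defined elsewhere in the build. A minimal sketch of such a declaration in the parent POM's <properties> block (the version string is illustrative, not taken from this commit):

<properties>
<hanlp.version>portable-1.8.4</hanlp.version>
</properties>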

View File

@@ -0,0 +1,63 @@
package com.hankcs.hanlp;
import com.tencent.supersonic.common.pojo.enums.DictWordType;
import lombok.Data;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.util.CollectionUtils;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.Set;
@Data
@Slf4j
public class LoadRemoveService {
@Value("${mapper.remove.nature.prefix:}")
private String mapperRemoveNaturePrefix;
public List<String> removeNatures(List<String> value, Set<Long> detectModelIds) {
if (CollectionUtils.isEmpty(value)) {
return value;
}
List<String> resultList = new ArrayList<>(value);
if (!CollectionUtils.isEmpty(detectModelIds)) {
resultList.removeIf(nature -> {
if (Objects.isNull(nature)) {
return false;
}
Long modelId = getViewId(nature);
if (Objects.nonNull(modelId)) {
return !detectModelIds.contains(modelId);
}
return false;
});
}
if (StringUtils.isNotBlank(mapperRemoveNaturePrefix)) {
resultList.removeIf(nature -> {
if (Objects.isNull(nature)) {
return false;
}
return nature.startsWith(mapperRemoveNaturePrefix);
});
}
return resultList;
}
public Long getViewId(String nature) {
try {
String[] split = nature.split(DictWordType.NATURE_SPILT);
if (split.length <= 1) {
return null;
}
return Long.valueOf(split[1]);
} catch (NumberFormatException e) {
log.error("", e);
}
return null;
}
}
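A minimal usage sketch of LoadRemoveService.removeNatures. The nature strings and model ids are hypothetical, and it assumes DictWordType.NATURE_SPILT is the underscore separator that getViewId splits on:

import java.util.Arrays;
import java.util.HashSet;
import java.util.List;

public class LoadRemoveServiceDemo {
public static void main(String[] args) {
LoadRemoveService service = new LoadRemoveService();
// hypothetical natures of the form <split>modelId<split>suffix
List<String> natures = Arrays.asList("_12_dimension", "_34_metric");
// natures whose model id is not in the detect set are removed
List<String> kept = service.removeNatures(natures, new HashSet<>(Arrays.asList(12L)));
System.out.println(kept); // [_12_dimension]
}
}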

View File

@@ -0,0 +1,331 @@
package com.hankcs.hanlp.collection.trie.bintrie;
import com.hankcs.hanlp.LoadRemoveService;
import com.hankcs.hanlp.corpus.io.ByteArray;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;
import java.util.AbstractMap;
import java.util.ArrayDeque;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Queue;
import java.util.Set;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public abstract class BaseNode<V> implements Comparable<BaseNode> {
/**
* Status array, kept for convenient access when loading
*/
static final Status[] ARRAY_STATUS = Status.values();
private static final Logger logger = LoggerFactory.getLogger(BaseNode.class);
/**
* Child nodes
*/
protected BaseNode[] child;
/**
* Node status
*/
protected Status status;
/**
* The character this node represents
*/
protected char c;
/**
* The value this node holds
*/
protected V value;
protected String prefix = null;
public BaseNode<V> transition(String path, int begin) {
BaseNode<V> cur = this;
for (int i = begin; i < path.length(); ++i) {
cur = cur.getChild(path.charAt(i));
if (cur == null || cur.status == Status.UNDEFINED_0) {
return null;
}
}
return cur;
}
public BaseNode<V> transition(char[] path, int begin) {
BaseNode<V> cur = this;
for (int i = begin; i < path.length; ++i) {
cur = cur.getChild(path[i]);
if (cur == null || cur.status == Status.UNDEFINED_0) {
return null;
}
}
return cur;
}
/**
* Transition by a single character
*
* @param path the character to transition on
* @return the target node, or null if there is no transition
*/
public BaseNode<V> transition(char path) {
BaseNode<V> cur = this;
cur = cur.getChild(path);
if (cur == null || cur.status == Status.UNDEFINED_0) {
return null;
}
return cur;
}
/**
* Add a child node
*
* @return true if a new node was added, false if an existing node was modified
*/
protected abstract boolean addChild(BaseNode node);
/**
* Whether this node has a child for the given character
*
* @param c the child's char
* @return whether the child exists
*/
protected boolean hasChild(char c) {
return getChild(c) != null;
}
protected char getChar() {
return c;
}
/**
* Get the child node for the given character
*
* @param c the child's char
* @return the child node
*/
public abstract BaseNode getChild(char c);
/**
* Get the value held by this node
*
* @return the value
*/
public final V getValue() {
return value;
}
/**
* Set the value held by this node
*
* @param value the value
*/
public final void setValue(V value) {
this.value = value;
}
@Override
public int compareTo(BaseNode other) {
return compareTo(other.getChar());
}
/**
* Overload: compare against a character
*
* @param other the character to compare with
* @return the comparison result
*/
public int compareTo(char other) {
if (this.c > other) {
return 1;
}
if (this.c < other) {
return -1;
}
return 0;
}
/**
* Get the word-formation status of this node
*
* @return the status
*/
public Status getStatus() {
return status;
}
protected void walk(StringBuilder sb, Set<Map.Entry<String, V>> entrySet) {
sb.append(c);
if (status == Status.WORD_MIDDLE_2 || status == Status.WORD_END_3) {
entrySet.add(new TrieEntry(sb.toString(), value));
}
if (child == null) {
return;
}
for (BaseNode node : child) {
if (node == null) {
continue;
}
node.walk(new StringBuilder(sb.toString()), entrySet);
}
}
protected void walkToSave(DataOutputStream out) throws IOException {
out.writeChar(c);
out.writeInt(status.ordinal());
int childSize = 0;
if (child != null) {
childSize = child.length;
}
out.writeInt(childSize);
if (child == null) {
return;
}
for (BaseNode node : child) {
node.walkToSave(out);
}
}
protected void walkToSave(ObjectOutput out) throws IOException {
out.writeChar(c);
out.writeInt(status.ordinal());
if (status == Status.WORD_END_3 || status == Status.WORD_MIDDLE_2) {
out.writeObject(value);
}
int childSize = 0;
if (child != null) {
childSize = child.length;
}
out.writeInt(childSize);
if (child == null) {
return;
}
for (BaseNode node : child) {
node.walkToSave(out);
}
}
protected void walkToLoad(ByteArray byteArray, _ValueArray<V> valueArray) {
c = byteArray.nextChar();
status = ARRAY_STATUS[byteArray.nextInt()];
if (status == Status.WORD_END_3 || status == Status.WORD_MIDDLE_2) {
value = valueArray.nextValue();
}
int childSize = byteArray.nextInt();
child = new BaseNode[childSize];
for (int i = 0; i < childSize; ++i) {
child[i] = new Node<V>();
child[i].walkToLoad(byteArray, valueArray);
}
}
protected void walkToLoad(ObjectInput byteArray) throws IOException, ClassNotFoundException {
c = byteArray.readChar();
status = ARRAY_STATUS[byteArray.readInt()];
if (status == Status.WORD_END_3 || status == Status.WORD_MIDDLE_2) {
value = (V) byteArray.readObject();
}
int childSize = byteArray.readInt();
child = new BaseNode[childSize];
for (int i = 0; i < childSize; ++i) {
child[i] = new Node<V>();
child[i].walkToLoad(byteArray);
}
}
public enum Status {
/**
* Unspecified; used for deleting entries
*/
UNDEFINED_0,
/**
* Not the end of a word
*/
NOT_WORD_1,
/**
* The end of a word, which may continue further
*/
WORD_MIDDLE_2,
/**
* The end of a word, with no continuation
*/
WORD_END_3,
}
public class TrieEntry extends AbstractMap.SimpleEntry<String, V> implements Comparable<TrieEntry> {
public TrieEntry(String key, V value) {
super(key, value);
}
@Override
public int compareTo(TrieEntry o) {
return getKey().compareTo(String.valueOf(o.getKey()));
}
}
@Override
public String toString() {
return "BaseNode{"
+ "child="
+ Arrays.toString(child)
+ ", status="
+ status
+ ", c="
+ c
+ ", value="
+ value
+ ", prefix='"
+ prefix
+ '\''
+ '}';
}
public void walkNode(Set<Map.Entry<String, V>> entrySet, Set<Long> detectModelIds) {
if (status == Status.WORD_MIDDLE_2 || status == Status.WORD_END_3) {
logger.debug("detectModelIds:{},before:{}", detectModelIds, value.toString());
List natures = new LoadRemoveService().removeNatures((List) value, detectModelIds);
String name = this.prefix != null ? this.prefix + c : "" + c;
logger.debug("name:{},after:{},natures:{}", name, (List) value, natures);
entrySet.add(new TrieEntry(name, (V) natures));
}
}
/**
* Walk the trie breadth-first, collecting at most {@code limit} entries
*
* @param sb accumulated prefix
* @param entrySet output set of entries
* @param limit maximum number of entries to collect
* @param detectModelIds model ids used to filter natures
*/
public void walkLimit(StringBuilder sb, Set<Map.Entry<String, V>> entrySet, int limit, Set<Long> detectModelIds) {
Queue<BaseNode> queue = new ArrayDeque<>();
this.prefix = sb.toString();
queue.add(this);
while (!queue.isEmpty()) {
if (entrySet.size() >= limit) {
break;
}
BaseNode root = queue.poll();
if (root == null) {
continue;
}
root.walkNode(entrySet, detectModelIds);
if (root.child == null) {
continue;
}
String prefix = root.prefix + root.c;
for (BaseNode node : root.child) {
if (Objects.nonNull(node)) {
node.prefix = prefix;
queue.add(node);
}
}
}
}
}
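A sketch of driving the new walkLimit from HanLP's BinTrie (which extends BaseNode). The keys and nature lists are invented, and passing null for detectModelIds assumes model filtering should be skipped, as removeNatures does for an empty set:

import com.hankcs.hanlp.collection.trie.bintrie.BinTrie;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;

public class WalkLimitDemo {
public static void main(String[] args) {
BinTrie<ArrayList<String>> trie = new BinTrie<>();
trie.put("alpha", new ArrayList<>(Arrays.asList("_1_dimension")));
trie.put("beta", new ArrayList<>(Arrays.asList("_2_metric")));
Set<Map.Entry<String, ArrayList<String>>> entries = new LinkedHashSet<>();
// breadth-first walk that stops once the entry budget is reached
trie.walkLimit(new StringBuilder(), entries, 1, null);
System.out.println(entries);
}
}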

View File

@@ -0,0 +1,393 @@
package com.hankcs.hanlp.dictionary;
import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.collection.trie.DoubleArrayTrie;
import com.hankcs.hanlp.corpus.io.ByteArray;
import com.hankcs.hanlp.corpus.io.IOUtil;
import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.utility.Predefine;
import com.hankcs.hanlp.utility.TextUtility;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.DataOutputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.util.Collection;
import java.util.TreeMap;
/**
* Core dictionary implemented with a DoubleArrayTrie
*/
public class CoreDictionary {
public static DoubleArrayTrie<Attribute> trie = new DoubleArrayTrie<Attribute>();
public static final String PATH = HanLP.Config.CoreDictionaryPath;
// Load the dictionary automatically at class initialization
static {
long start = System.currentTimeMillis();
if (!load(PATH)) {
throw new IllegalArgumentException("Failed to load core dictionary " + PATH);
} else {
Predefine.logger.info(PATH + " loaded successfully: " + trie.size() + " entries in "
+ (System.currentTimeMillis() - start) + "ms");
}
}
// Some special word IDs
public static final int NR_WORD_ID = getWordID(Predefine.TAG_PEOPLE);
public static final int NS_WORD_ID = getWordID(Predefine.TAG_PLACE);
public static final int NT_WORD_ID = getWordID(Predefine.TAG_GROUP);
public static final int T_WORD_ID = getWordID(Predefine.TAG_TIME);
public static final int X_WORD_ID = getWordID(Predefine.TAG_CLUSTER);
public static final int M_WORD_ID = getWordID(Predefine.TAG_NUMBER);
public static final int NX_WORD_ID = getWordID(Predefine.TAG_PROPER);
private static boolean load(String path) {
Predefine.logger.info("核心词典开始加载:" + path);
if (loadDat(path)) {
return true;
}
TreeMap<String, Attribute> map = new TreeMap<String, Attribute>();
try (BufferedReader br = new BufferedReader(
new InputStreamReader(IOUtil.newInputStream(path), "UTF-8"))) {
String line;
int totalFrequency = 0;
long start = System.currentTimeMillis();
while ((line = br.readLine()) != null) {
String[] param = line.split("\\s");
int natureCount = (param.length - 1) / 2;
Attribute attribute = new Attribute(natureCount);
for (int i = 0; i < natureCount; ++i) {
attribute.nature[i] = Nature.create(param[1 + 2 * i]);
attribute.frequency[i] = Integer.parseInt(param[2 + 2 * i]);
attribute.totalFrequency += attribute.frequency[i];
}
map.put(param[0], attribute);
totalFrequency += attribute.totalFrequency;
}
Predefine.logger.info("Core dictionary read " + map.size() + " entries with total frequency "
+ totalFrequency + " in " + (System.currentTimeMillis() - start) + "ms");
trie.build(map);
Predefine.logger.info("核心词典加载成功:" + trie.size() + "个词条,下面将写入缓存……");
try {
DataOutputStream out = new DataOutputStream(
new BufferedOutputStream(IOUtil.newOutputStream(path + Predefine.BIN_EXT)));
Collection<Attribute> attributeList = map.values();
out.writeInt(attributeList.size());
for (Attribute attribute : attributeList) {
out.writeInt(attribute.totalFrequency);
out.writeInt(attribute.nature.length);
for (int i = 0; i < attribute.nature.length; ++i) {
out.writeInt(attribute.nature[i].ordinal());
out.writeInt(attribute.frequency[i]);
}
}
trie.save(out);
out.writeInt(totalFrequency);
Predefine.setTotalFrequency(totalFrequency);
out.close();
} catch (Exception e) {
Predefine.logger.warning("保存失败" + e);
return false;
}
} catch (FileNotFoundException e) {
Predefine.logger.warning("核心词典" + path + "不存在!" + e);
return false;
} catch (IOException e) {
Predefine.logger.warning("核心词典" + path + "读取错误!" + e);
return false;
}
return true;
}
/**
* Load the double-array trie from disk
*
* @param path dictionary path
* @return whether loading succeeded
*/
static boolean loadDat(String path) {
try {
ByteArray byteArray = ByteArray.createByteArray(path + Predefine.BIN_EXT);
if (byteArray == null) {
return false;
}
int size = byteArray.nextInt();
Attribute[] attributes = new Attribute[size];
final Nature[] natureIndexArray = Nature.values();
for (int i = 0; i < size; ++i) {
// the first int is the total frequency, the second the number of natures
int currentTotalFrequency = byteArray.nextInt();
int length = byteArray.nextInt();
attributes[i] = new Attribute(length);
attributes[i].totalFrequency = currentTotalFrequency;
for (int j = 0; j < length; ++j) {
attributes[i].nature[j] = natureIndexArray[byteArray.nextInt()];
attributes[i].frequency[j] = byteArray.nextInt();
}
}
if (!trie.load(byteArray, attributes)) {
return false;
}
int totalFrequency = 0;
if (byteArray.hasMore()) {
totalFrequency = byteArray.nextInt();
} else {
for (Attribute attribute : attributes) {
totalFrequency += attribute.totalFrequency;
}
}
Predefine.setTotalFrequency(totalFrequency);
} catch (Exception e) {
Predefine.logger.warning("读取失败,问题发生在" + e);
return false;
}
return true;
}
/**
* Get an entry
*
* @param key the word
* @return its attribute
*/
public static Attribute get(String key) {
return trie.get(key);
}
/**
* Get an entry
*
* @param wordID the word's ID
* @return its attribute
*/
public static Attribute get(int wordID) {
return trie.get(wordID);
}
/**
* Get a term's total frequency
*
* @param term the word
* @return its frequency
*/
public static int getTermFrequency(String term) {
Attribute attribute = get(term);
if (attribute == null) {
return 0;
}
return attribute.totalFrequency;
}
/**
* Whether the dictionary contains the word
*
* @param key the word
* @return whether it is present
*/
public static boolean contains(String key) {
return trie.get(key) != null;
}
/**
* Attributes of a word in the core dictionary
*/
public static class Attribute implements Serializable {
/**
* Part-of-speech (nature) list
*/
public Nature[] nature;
/**
* Frequency of each part of speech
*/
public int[] frequency;
public int totalFrequency;
public String original = null;
public Attribute(int size) {
nature = new Nature[size];
frequency = new int[size];
}
public Attribute(Nature[] nature, int[] frequency) {
this.nature = nature;
this.frequency = frequency;
}
public Attribute(Nature nature, int frequency) {
this(1);
this.nature[0] = nature;
this.frequency[0] = frequency;
totalFrequency = frequency;
}
public Attribute(Nature[] nature, int[] frequency, int totalFrequency) {
this.nature = nature;
this.frequency = frequency;
this.totalFrequency = totalFrequency;
}
/**
* Construct with a single nature and the default frequency of 1000
*
* @param nature the nature
*/
public Attribute(Nature nature) {
this(nature, 1000);
}
public static Attribute create(String natureWithFrequency) {
try {
String[] param = natureWithFrequency.split(" ");
if (param.length % 2 != 0) {
return new Attribute(Nature.create(natureWithFrequency.trim()), 1); // safety fallback for malformed input
}
int natureCount = param.length / 2;
Attribute attribute = new Attribute(natureCount);
for (int i = 0; i < natureCount; ++i) {
attribute.nature[i] = Nature.create(param[2 * i]);
attribute.frequency[i] = Integer.parseInt(param[1 + 2 * i]);
attribute.totalFrequency += attribute.frequency[i];
}
return attribute;
} catch (Exception e) {
Predefine.logger.warning("使用字符串" + natureWithFrequency + "创建词条属性失败!"
+ TextUtility.exceptionToString(e));
return null;
}
}
/**
* Load from a byte stream
*
* @param byteArray the byte stream
* @param natureIndexArray index of natures
* @return the attribute
*/
public static Attribute create(ByteArray byteArray, Nature[] natureIndexArray) {
int currentTotalFrequency = byteArray.nextInt();
int length = byteArray.nextInt();
Attribute attribute = new Attribute(length);
attribute.totalFrequency = currentTotalFrequency;
for (int j = 0; j < length; ++j) {
attribute.nature[j] = natureIndexArray[byteArray.nextInt()];
attribute.frequency[j] = byteArray.nextInt();
}
return attribute;
}
/**
* Get the frequency of a nature
*
* @param nature the nature as a string
* @return its frequency
* @deprecated prefer the overload taking a Nature
*/
public int getNatureFrequency(String nature) {
try {
Nature pos = Nature.create(nature);
return getNatureFrequency(pos);
} catch (IllegalArgumentException e) {
return 0;
}
}
/**
* Get the frequency of a nature
*
* @param nature the nature
* @return its frequency
*/
public int getNatureFrequency(final Nature nature) {
int i = 0;
for (Nature pos : this.nature) {
if (nature == pos) {
return frequency[i];
}
++i;
}
return 0;
}
/**
* Whether this word has the given nature
*
* @param nature the nature
* @return whether it is present
*/
public boolean hasNature(Nature nature) {
return getNatureFrequency(nature) > 0;
}
/**
* Whether any nature starts with the given prefix
*
* @param prefix nature prefix; e.g. u matches ude, uzhe, etc.
* @return whether such a nature exists
*/
public boolean hasNatureStartsWith(String prefix) {
for (Nature n : nature) {
if (n.startsWith(prefix)) {
return true;
}
}
return false;
}
@Override
public String toString() {
final StringBuilder sb = new StringBuilder();
for (int i = 0; i < nature.length; ++i) {
sb.append(nature[i]).append(' ').append(frequency[i]).append(' ');
}
return sb.toString();
}
public void save(DataOutputStream out) throws IOException {
out.writeInt(totalFrequency);
out.writeInt(nature.length);
for (int i = 0; i < nature.length; ++i) {
out.writeInt(nature[i].ordinal());
out.writeInt(frequency[i]);
}
}
}
/**
* Get a word's ID
*
* @param a the word
* @return its ID, or -1 if absent
*/
public static int getWordID(String a) {
return CoreDictionary.trie.exactMatchSearch(a);
}
/**
* Hot-reload the core dictionary<br>
* In cluster environments or with other IOAdapters, the cache file must be deleted manually
*
* @return whether the reload succeeded
*/
public static boolean reload() {
String path = CoreDictionary.PATH;
IOUtil.deleteFile(path + Predefine.BIN_EXT);
return load(path);
}
}
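A small sketch of the Attribute API above; the "nature frequency" string is illustrative, and this path does not load the dictionary files, since only the nested Attribute class is initialized:

import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.dictionary.CoreDictionary;

public class AttributeDemo {
public static void main(String[] args) {
// alternating "nature frequency" pairs, as parsed by Attribute.create
CoreDictionary.Attribute attr = CoreDictionary.Attribute.create("n 1000 v 200");
System.out.println(attr.totalFrequency); // 1200
System.out.println(attr.getNatureFrequency(Nature.v)); // 200
System.out.println(attr.hasNatureStartsWith("n")); // true
}
}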

View File

@@ -0,0 +1,342 @@
package com.hankcs.hanlp.seg;
import com.hankcs.hanlp.algorithm.Viterbi;
import com.hankcs.hanlp.collection.AhoCorasick.AhoCorasickDoubleArrayTrie;
import com.hankcs.hanlp.collection.trie.DoubleArrayTrie;
import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.dictionary.CoreDictionary;
import com.hankcs.hanlp.seg.common.Term;
import com.hankcs.hanlp.dictionary.CoreDictionaryTransformMatrixDictionary;
import com.hankcs.hanlp.dictionary.other.CharType;
import com.hankcs.hanlp.seg.NShort.Path.AtomNode;
import com.hankcs.hanlp.seg.common.Graph;
import com.hankcs.hanlp.seg.common.Vertex;
import com.hankcs.hanlp.seg.common.WordNet;
import com.hankcs.hanlp.utility.TextUtility;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.ListIterator;
public abstract class WordBasedSegment extends Segment {
public WordBasedSegment() {
}
protected static void generateWord(List<Vertex> linkedArray, WordNet wordNetOptimum) {
fixResultByRule(linkedArray);
wordNetOptimum.addAll(linkedArray);
}
protected static void fixResultByRule(List<Vertex> linkedArray) {
mergeContinueNumIntoOne(linkedArray);
changeDelimiterPOS(linkedArray);
splitMiddleSlashFromDigitalWords(linkedArray);
checkDateElements(linkedArray);
}
static void changeDelimiterPOS(List<Vertex> linkedArray) {
for (Vertex vertex : linkedArray) {
// fullwidth dashes, the em dash and the ASCII hyphen are delimiters
if (vertex.realWord.equals("－－") || vertex.realWord.equals("—") || vertex.realWord.equals("-")) {
vertex.confirmNature(Nature.w);
}
}
}
private static void splitMiddleSlashFromDigitalWords(List<Vertex> linkedArray) {
if (linkedArray.size() < 2) {
return;
}
ListIterator<Vertex> listIterator = linkedArray.listIterator();
Vertex next = listIterator.next();
for (Vertex current = next; listIterator.hasNext(); current = next) {
next = listIterator.next();
Nature currentNature = current.getNature();
if (currentNature == Nature.nx && (next.hasNature(Nature.q) || next.hasNature(Nature.n))) {
// a limit of 2 yields at most two parts; a limit of 1 would never split
String[] param = current.realWord.split("-", 2);
if (param.length == 2 && TextUtility.isAllNum(param[0]) && TextUtility.isAllNum(param[1])) {
current = current.copy();
current.realWord = param[0];
current.confirmNature(Nature.m);
listIterator.previous();
listIterator.previous();
listIterator.set(current);
listIterator.next();
listIterator.add(Vertex.newPunctuationInstance("-"));
listIterator.add(Vertex.newNumberInstance(param[1]));
}
}
}
}
private static void checkDateElements(List<Vertex> linkedArray) {
if (linkedArray.size() < 2) {
return;
}
ListIterator<Vertex> listIterator = linkedArray.listIterator();
Vertex next = listIterator.next();
for (Vertex current = next; listIterator.hasNext(); current = next) {
next = listIterator.next();
if (TextUtility.isAllNum(current.realWord) || TextUtility.isAllChineseNum(current.realWord)) {
String nextWord = next.realWord;
if (nextWord.length() == 1 && "月日时分秒".contains(nextWord)
|| nextWord.length() == 2 && nextWord.equals("月份")) {
mergeDate(listIterator, next, current);
} else if (nextWord.equals("年")) {
if (TextUtility.isYearTime(current.realWord)) {
mergeDate(listIterator, next, current);
} else {
current.confirmNature(Nature.m);
}
} else if (current.realWord.endsWith("点")) {
current.confirmNature(Nature.t, true);
} else {
char[] tmpCharArray = current.realWord.toCharArray();
String lastChar = String.valueOf(tmpCharArray[tmpCharArray.length - 1]);
if (!"∶·././".contains(lastChar)) {
current.confirmNature(Nature.m, true);
} else if (current.realWord.length() > 1) {
char last = current.realWord.charAt(current.realWord.length() - 1);
current = Vertex.newNumberInstance(
current.realWord.substring(0, current.realWord.length() - 1));
listIterator.previous();
listIterator.previous();
listIterator.set(current);
listIterator.next();
listIterator.add(Vertex.newPunctuationInstance(String.valueOf(last)));
}
}
}
}
}
private static void mergeDate(ListIterator<Vertex> listIterator, Vertex next, Vertex current) {
current = Vertex.newTimeInstance(current.realWord + next.realWord);
listIterator.previous();
listIterator.previous();
listIterator.set(current);
listIterator.next();
listIterator.next();
listIterator.remove();
}
protected static List<Term> convert(List<Vertex> vertexList) {
return Segment.convert(vertexList, false);
}
protected static Graph generateBiGraph(WordNet wordNet) {
return wordNet.toGraph();
}
/**
* @deprecated
*/
private static List<AtomNode> atomSegment(String sSentence, int start, int end) {
if (end < start) {
throw new RuntimeException("start=" + start + " < end=" + end);
}
List<AtomNode> atomSegment = new ArrayList<>();
int pCur = 0;
StringBuilder sb = new StringBuilder();
char[] charArray = sSentence.substring(start, end).toCharArray();
// numeric codes are CharType constants (9 appears to be CT_NUM, 5 CT_SINGLE, 8 CT_LETTER)
int[] charTypeArray = new int[charArray.length];
for (int i = 0; i < charArray.length; ++i) {
char c = charArray[i];
charTypeArray[i] = CharType.get(c);
if (c == '.' && i < charArray.length - 1 && CharType.get(charArray[i + 1]) == 9) {
charTypeArray[i] = 9;
} else if (c == '.' && i < charArray.length - 1 && charArray[i + 1] >= '0' && charArray[i + 1] <= '9') {
charTypeArray[i] = 5;
} else if (charTypeArray[i] == 8) {
charTypeArray[i] = 5;
}
}
while (pCur < charArray.length) {
int nCurType = charTypeArray[pCur];
if (nCurType == 7 || nCurType == 10 || nCurType == 6 || nCurType == 17) {
// Chinese, index, delimiter or other: each char is its own atom
String single = String.valueOf(charArray[pCur]);
if (single.length() != 0) {
atomSegment.add(new AtomNode(single, nCurType));
}
++pCur;
} else if (pCur < charArray.length - 1 && (nCurType == 5 || nCurType == 9)) {
// run of same-type single-byte chars or digits: merge into one atom
sb.delete(0, sb.length());
sb.append(charArray[pCur]);
boolean reachEnd = true;
while (pCur < charArray.length - 1) {
++pCur;
int nNextType = charTypeArray[pCur];
if (nNextType != nCurType) {
reachEnd = false;
break;
}
sb.append(charArray[pCur]);
}
atomSegment.add(new AtomNode(sb.toString(), nCurType));
if (reachEnd) {
++pCur;
}
} else {
atomSegment.add(new AtomNode(charArray[pCur], nCurType));
++pCur;
}
}
return atomSegment;
}
private static void mergeContinueNumIntoOne(List<Vertex> linkedArray) {
if (linkedArray.size() < 2) {
return;
}
ListIterator<Vertex> listIterator = linkedArray.listIterator();
Vertex current = listIterator.next();
while (listIterator.hasNext()) {
Vertex next = listIterator.next();
if ((TextUtility.isAllNum(current.realWord) || TextUtility.isAllChineseNum(current.realWord))
&& (TextUtility.isAllNum(next.realWord) || TextUtility.isAllChineseNum(next.realWord))) {
current = Vertex.newNumberInstance(current.realWord + next.realWord);
listIterator.previous();
listIterator.previous();
listIterator.set(current);
listIterator.next();
listIterator.next();
listIterator.remove();
} else {
current = next;
}
}
}
protected void generateWordNet(final WordNet wordNetStorage) {
final char[] charArray = wordNetStorage.charArray;
DoubleArrayTrie.Searcher searcher = CoreDictionary.trie.getSearcher(charArray, 0);
while (searcher.next()) {
wordNetStorage.add(searcher.begin + 1, new Vertex(new String(charArray, searcher.begin, searcher.length),
(CoreDictionary.Attribute) searcher.value, searcher.index));
}
if (this.config.forceCustomDictionary) {
this.customDictionary.parseText(charArray, new AhoCorasickDoubleArrayTrie.IHit<CoreDictionary.Attribute>() {
public void hit(int begin, int end, CoreDictionary.Attribute value) {
wordNetStorage.add(begin + 1, new Vertex(new String(charArray, begin, end - begin), value));
}
});
}
LinkedList<Vertex>[] vertexes = wordNetStorage.getVertexes();
int i = 1;
while (i < vertexes.length) {
if (vertexes[i].isEmpty()) {
int j = i + 1;
while (j < vertexes.length - 1 && (vertexes[j].isEmpty() || CharType.get(charArray[j - 1]) == 11)) {
++j;
}
wordNetStorage.add(i, Segment.quickAtomSegment(charArray, i - 1, j - 1));
i = j;
} else {
i += vertexes[i].getLast().realWord.length();
}
}
}
protected List<Term> decorateResultForIndexMode(List<Vertex> vertexList, WordNet wordNetAll) {
List<Term> termList = new LinkedList<>();
int line = 1;
ListIterator<Vertex> listIterator = vertexList.listIterator();
listIterator.next();
int length = vertexList.size() - 2;
for (int i = 0; i < length; ++i) {
Vertex vertex = listIterator.next();
Term termMain = Segment.convert(vertex);
// emit one Term per nature instead of a single converted term
addTerms(termList, vertex, line - 1);
termMain.offset = line - 1;
if (vertex.realWord.length() > 2) {
// enumerate sub-words fully contained in the current word
for (int currentLine = line; currentLine < line + vertex.realWord.length(); ++currentLine) {
Iterator<Vertex> iterator = wordNetAll.descendingIterator(currentLine);
while (iterator.hasNext()) {
Vertex smallVertex = iterator.next();
boolean quantifierPart = termMain.nature == Nature.mq && smallVertex.hasNature(Nature.q);
if (!quantifierPart && smallVertex.realWord.length() < this.config.indexMode) {
continue;
}
if (smallVertex != vertex
&& currentLine + smallVertex.realWord.length() <= line + vertex.realWord.length()) {
listIterator.add(smallVertex);
addTerms(termList, smallVertex, currentLine - 1);
}
}
}
}
line += vertex.realWord.length();
}
return termList;
}
protected static void speechTagging(List<Vertex> vertexList) {
Viterbi.compute(vertexList, CoreDictionaryTransformMatrixDictionary.transformMatrixDictionary);
}
protected void addTerms(List<Term> terms, Vertex vertex, int offset) {
for (int i = 0; i < vertex.attribute.nature.length; i++) {
Term term = new Term(vertex.realWord, vertex.attribute.nature[i]);
term.setFrequency(vertex.attribute.frequency[i]);
term.offset = offset;
terms.add(term);
}
}
}
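The per-nature expansion in addTerms is what replaces the single-Term conversion in index mode. A hedged sketch of its effect; the subclass exists only to reach the protected method, the word and natures are invented, and constructing a Segment may require HanLP's dictionary resources on the classpath:

import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.dictionary.CoreDictionary;
import com.hankcs.hanlp.seg.WordBasedSegment;
import com.hankcs.hanlp.seg.common.Term;
import com.hankcs.hanlp.seg.common.Vertex;
import java.util.ArrayList;
import java.util.List;

public class AddTermsDemo extends WordBasedSegment {
@Override
protected List<Term> segSentence(char[] sentence) {
return new ArrayList<>(); // not exercised in this demo
}

public static void main(String[] args) {
CoreDictionary.Attribute attr = new CoreDictionary.Attribute(
new Nature[]{Nature.n, Nature.v}, new int[]{100, 50});
Vertex vertex = new Vertex("知识", attr);
List<Term> terms = new ArrayList<>();
// one Term per nature: (知识/n, freq 100) and (知识/v, freq 50)
new AddTermsDemo().addTerms(terms, vertex, 0);
System.out.println(terms.size()); // 2
}
}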

View File

@@ -0,0 +1,76 @@
package com.hankcs.hanlp.seg.common;
import com.hankcs.hanlp.corpus.tag.Nature;
//import com.hankcs.hanlp.dictionary.CoreDictionary;
//import com.hankcs.hanlp.dictionary.CustomDictionary;
//import com.hankcs.hanlp.dictionary.DynamicCustomDictionary;
import lombok.Data;
import lombok.ToString;
//import static com.hankcs.hanlp.HanLP.Config.CustomDictionaryPath;
@Data
@ToString
public class Term {
public String word;
public Nature nature;
public int offset;
public int frequency = 0;
public Term(String word, Nature nature) {
this.word = word;
this.nature = nature;
}
public Term(String word, Nature nature, int offset) {
this.word = word;
this.nature = nature;
this.offset = offset;
}
public Term(String word, Nature nature, int offset, int frequency) {
this.word = word;
this.nature = nature;
this.offset = offset;
this.frequency = frequency;
}
public int length() {
return this.word.length();
}
public int getFrequency() {
if (frequency > 0) {
return frequency;
}
// todo opt
/*
String wordOri = word.toLowerCase();
CoreDictionary.Attribute attribute = getDynamicCustomDictionary().get(wordOri);
if (attribute == null) {
attribute = CoreDictionary.get(wordOri);
if (attribute == null) {
attribute = CustomDictionary.get(wordOri);
}
}
if (attribute != null && nature != null && attribute.hasNature(nature)) {
return attribute.getNatureFrequency(nature);
}
return attribute == null ? 0 : attribute.totalFrequency;
*/
return 0;
}
@Override
public boolean equals(Object obj) {
if (obj instanceof Term) {
Term term = (Term) obj;
if (this.nature == term.nature && this.word.equals(term.word)) {
return true;
}
}
return super.equals(obj);
}
}
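Term.equals regards two terms as equal when word and nature match, ignoring offset and frequency; a quick illustration with invented values:

import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.seg.common.Term;

public class TermEqualsDemo {
public static void main(String[] args) {
Term a = new Term("知识", Nature.n, 0);
Term b = new Term("知识", Nature.n, 7);
System.out.println(a.equals(b)); // true: offset plays no part in equality
}
}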