mirror of
https://github.com/tencentmusic/supersonic.git
synced 2026-04-22 14:54:21 +08:00
(improvement)(build) Add spotless during the build process. (#1639)
This commit is contained in:
@@ -24,12 +24,13 @@ public class LoadRemoveService {
|
||||
}
|
||||
List<String> resultList = new ArrayList<>(value);
|
||||
if (StringUtils.isNotBlank(mapperRemoveNaturePrefix)) {
|
||||
resultList.removeIf(nature -> {
|
||||
if (Objects.isNull(nature)) {
|
||||
return false;
|
||||
}
|
||||
return nature.startsWith(mapperRemoveNaturePrefix);
|
||||
});
|
||||
resultList.removeIf(
|
||||
nature -> {
|
||||
if (Objects.isNull(nature)) {
|
||||
return false;
|
||||
}
|
||||
return nature.startsWith(mapperRemoveNaturePrefix);
|
||||
});
|
||||
}
|
||||
return resultList;
|
||||
}
|
||||
@@ -46,5 +47,4 @@ public class LoadRemoveService {
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -20,26 +20,16 @@ import java.util.Set;
|
||||
@Slf4j
|
||||
public abstract class BaseNode<V> implements Comparable<BaseNode> {
|
||||
|
||||
/**
|
||||
* 状态数组,方便读取的时候用
|
||||
*/
|
||||
/** 状态数组,方便读取的时候用 */
|
||||
static final Status[] ARRAY_STATUS = Status.values();
|
||||
|
||||
/**
|
||||
* 子节点
|
||||
*/
|
||||
/** 子节点 */
|
||||
protected BaseNode[] child;
|
||||
/**
|
||||
* 节点状态
|
||||
*/
|
||||
/** 节点状态 */
|
||||
protected Status status;
|
||||
/**
|
||||
* 节点代表的字符
|
||||
*/
|
||||
/** 节点代表的字符 */
|
||||
protected char c;
|
||||
/**
|
||||
* 节点代表的值
|
||||
*/
|
||||
/** 节点代表的值 */
|
||||
protected V value;
|
||||
|
||||
protected String prefix = null;
|
||||
@@ -238,25 +228,18 @@ public abstract class BaseNode<V> implements Comparable<BaseNode> {
|
||||
}
|
||||
|
||||
public enum Status {
|
||||
/**
|
||||
* 未指定,用于删除词条
|
||||
*/
|
||||
/** 未指定,用于删除词条 */
|
||||
UNDEFINED_0,
|
||||
/**
|
||||
* 不是词语的结尾
|
||||
*/
|
||||
/** 不是词语的结尾 */
|
||||
NOT_WORD_1,
|
||||
/**
|
||||
* 是个词语的结尾,并且还可以继续
|
||||
*/
|
||||
/** 是个词语的结尾,并且还可以继续 */
|
||||
WORD_MIDDLE_2,
|
||||
/**
|
||||
* 是个词语的结尾,并且没有继续
|
||||
*/
|
||||
/** 是个词语的结尾,并且没有继续 */
|
||||
WORD_END_3,
|
||||
}
|
||||
|
||||
public class TrieEntry extends AbstractMap.SimpleEntry<String, V> implements Comparable<TrieEntry> {
|
||||
public class TrieEntry extends AbstractMap.SimpleEntry<String, V>
|
||||
implements Comparable<TrieEntry> {
|
||||
|
||||
public TrieEntry(String key, V value) {
|
||||
super(key, value);
|
||||
@@ -295,8 +278,9 @@ public abstract class BaseNode<V> implements Comparable<BaseNode> {
|
||||
}
|
||||
}
|
||||
|
||||
/***
|
||||
* walk limit
|
||||
/**
|
||||
* * walk limit
|
||||
*
|
||||
* @param sb
|
||||
* @param entrySet
|
||||
*/
|
||||
@@ -322,5 +306,4 @@ public abstract class BaseNode<V> implements Comparable<BaseNode> {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
package com.hankcs.hanlp.dictionary;
|
||||
|
||||
|
||||
import com.hankcs.hanlp.HanLP;
|
||||
import com.hankcs.hanlp.collection.trie.DoubleArrayTrie;
|
||||
import com.hankcs.hanlp.corpus.io.ByteArray;
|
||||
@@ -8,6 +7,7 @@ import com.hankcs.hanlp.corpus.io.IOUtil;
|
||||
import com.hankcs.hanlp.corpus.tag.Nature;
|
||||
import com.hankcs.hanlp.utility.Predefine;
|
||||
import com.hankcs.hanlp.utility.TextUtility;
|
||||
|
||||
import java.io.BufferedOutputStream;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.DataOutputStream;
|
||||
@@ -21,9 +21,7 @@ import java.util.List;
|
||||
import java.util.TreeMap;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/**
|
||||
* 使用DoubleArrayTrie实现的核心词典
|
||||
*/
|
||||
/** 使用DoubleArrayTrie实现的核心词典 */
|
||||
public class CoreDictionary {
|
||||
|
||||
public static DoubleArrayTrie<Attribute> trie = new DoubleArrayTrie<Attribute>();
|
||||
@@ -36,8 +34,13 @@ public class CoreDictionary {
|
||||
if (!load(PATH)) {
|
||||
throw new IllegalArgumentException("核心词典" + PATH + "加载失败");
|
||||
} else {
|
||||
Predefine.logger.info(PATH + "加载成功," + trie.size() + "个词条,耗时"
|
||||
+ (System.currentTimeMillis() - start) + "ms");
|
||||
Predefine.logger.info(
|
||||
PATH
|
||||
+ "加载成功,"
|
||||
+ trie.size()
|
||||
+ "个词条,耗时"
|
||||
+ (System.currentTimeMillis() - start)
|
||||
+ "ms");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -75,15 +78,21 @@ public class CoreDictionary {
|
||||
totalFrequency += attribute.totalFrequency;
|
||||
}
|
||||
Predefine.logger.info(
|
||||
"核心词典读入词条" + map.size() + " 全部频次" + totalFrequency + ",耗时" + (
|
||||
System.currentTimeMillis() - start)
|
||||
"核心词典读入词条"
|
||||
+ map.size()
|
||||
+ " 全部频次"
|
||||
+ totalFrequency
|
||||
+ ",耗时"
|
||||
+ (System.currentTimeMillis() - start)
|
||||
+ "ms");
|
||||
br.close();
|
||||
trie.build(map);
|
||||
Predefine.logger.info("核心词典加载成功:" + trie.size() + "个词条,下面将写入缓存……");
|
||||
try {
|
||||
DataOutputStream out = new DataOutputStream(
|
||||
new BufferedOutputStream(IOUtil.newOutputStream(path + Predefine.BIN_EXT)));
|
||||
DataOutputStream out =
|
||||
new DataOutputStream(
|
||||
new BufferedOutputStream(
|
||||
IOUtil.newOutputStream(path + Predefine.BIN_EXT)));
|
||||
Collection<Attribute> attributeList = map.values();
|
||||
out.writeInt(attributeList.size());
|
||||
for (Attribute attribute : attributeList) {
|
||||
@@ -202,25 +211,18 @@ public class CoreDictionary {
|
||||
return trie.get(key) != null;
|
||||
}
|
||||
|
||||
/**
|
||||
* 核心词典中的词属性
|
||||
*/
|
||||
/** 核心词典中的词属性 */
|
||||
public static class Attribute implements Serializable {
|
||||
|
||||
/**
|
||||
* 词性列表
|
||||
*/
|
||||
/** 词性列表 */
|
||||
public Nature[] nature;
|
||||
/**
|
||||
* 词性对应的词频
|
||||
*/
|
||||
/** 词性对应的词频 */
|
||||
public int[] frequency;
|
||||
|
||||
public int totalFrequency;
|
||||
public String[] originals;
|
||||
public String original = null;
|
||||
|
||||
|
||||
public Attribute(int size) {
|
||||
nature = new Nature[size];
|
||||
frequency = new int[size];
|
||||
@@ -276,8 +278,11 @@ public class CoreDictionary {
|
||||
}
|
||||
return attribute;
|
||||
} catch (Exception e) {
|
||||
Predefine.logger.warning("使用字符串" + natureWithFrequency + "创建词条属性失败!"
|
||||
+ TextUtility.exceptionToString(e));
|
||||
Predefine.logger.warning(
|
||||
"使用字符串"
|
||||
+ natureWithFrequency
|
||||
+ "创建词条属性失败!"
|
||||
+ TextUtility.exceptionToString(e));
|
||||
return null;
|
||||
}
|
||||
}
|
||||
@@ -404,7 +409,10 @@ public class CoreDictionary {
|
||||
if (originals == null || originals.length == 0) {
|
||||
return null;
|
||||
}
|
||||
return Arrays.stream(originals).filter(o -> o != null).distinct().collect(Collectors.toList());
|
||||
return Arrays.stream(originals)
|
||||
.filter(o -> o != null)
|
||||
.distinct()
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -431,4 +439,3 @@ public class CoreDictionary {
|
||||
return load(path);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,16 +1,15 @@
|
||||
package com.hankcs.hanlp.seg;
|
||||
|
||||
|
||||
import com.hankcs.hanlp.algorithm.Viterbi;
|
||||
import com.hankcs.hanlp.collection.AhoCorasick.AhoCorasickDoubleArrayTrie;
|
||||
import com.hankcs.hanlp.collection.trie.DoubleArrayTrie;
|
||||
import com.hankcs.hanlp.corpus.tag.Nature;
|
||||
import com.hankcs.hanlp.dictionary.CoreDictionary;
|
||||
import com.hankcs.hanlp.seg.common.Term;
|
||||
import com.hankcs.hanlp.dictionary.CoreDictionaryTransformMatrixDictionary;
|
||||
import com.hankcs.hanlp.dictionary.other.CharType;
|
||||
import com.hankcs.hanlp.seg.NShort.Path.AtomNode;
|
||||
import com.hankcs.hanlp.seg.common.Graph;
|
||||
import com.hankcs.hanlp.seg.common.Term;
|
||||
import com.hankcs.hanlp.seg.common.Vertex;
|
||||
import com.hankcs.hanlp.seg.common.WordNet;
|
||||
import com.hankcs.hanlp.utility.TextUtility;
|
||||
@@ -21,11 +20,9 @@ import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.ListIterator;
|
||||
|
||||
|
||||
public abstract class WordBasedSegment extends Segment {
|
||||
|
||||
public WordBasedSegment() {
|
||||
}
|
||||
public WordBasedSegment() {}
|
||||
|
||||
protected static void generateWord(List<Vertex> linkedArray, WordNet wordNetOptimum) {
|
||||
fixResultByRule(linkedArray);
|
||||
@@ -50,7 +47,9 @@ public abstract class WordBasedSegment extends Segment {
|
||||
}
|
||||
|
||||
vertex = (Vertex) var1.next();
|
||||
} while (!vertex.realWord.equals("--") && !vertex.realWord.equals("—") && !vertex.realWord.equals("-"));
|
||||
} while (!vertex.realWord.equals("--")
|
||||
&& !vertex.realWord.equals("—")
|
||||
&& !vertex.realWord.equals("-"));
|
||||
|
||||
vertex.confirmNature(Nature.w);
|
||||
}
|
||||
@@ -64,9 +63,12 @@ public abstract class WordBasedSegment extends Segment {
|
||||
for (Vertex current = next; listIterator.hasNext(); current = next) {
|
||||
next = (Vertex) listIterator.next();
|
||||
Nature currentNature = current.getNature();
|
||||
if (currentNature == Nature.nx && (next.hasNature(Nature.q) || next.hasNature(Nature.n))) {
|
||||
if (currentNature == Nature.nx
|
||||
&& (next.hasNature(Nature.q) || next.hasNature(Nature.n))) {
|
||||
String[] param = current.realWord.split("-", 1);
|
||||
if (param.length == 2 && TextUtility.isAllNum(param[0]) && TextUtility.isAllNum(param[1])) {
|
||||
if (param.length == 2
|
||||
&& TextUtility.isAllNum(param[0])
|
||||
&& TextUtility.isAllNum(param[1])) {
|
||||
current = current.copy();
|
||||
current.realWord = param[0];
|
||||
current.confirmNature(Nature.m);
|
||||
@@ -79,7 +81,6 @@ public abstract class WordBasedSegment extends Segment {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@@ -90,7 +91,8 @@ public abstract class WordBasedSegment extends Segment {
|
||||
|
||||
for (Vertex current = next; listIterator.hasNext(); current = next) {
|
||||
next = (Vertex) listIterator.next();
|
||||
if (TextUtility.isAllNum(current.realWord) || TextUtility.isAllChineseNum(current.realWord)) {
|
||||
if (TextUtility.isAllNum(current.realWord)
|
||||
|| TextUtility.isAllChineseNum(current.realWord)) {
|
||||
String nextWord = next.realWord;
|
||||
if (nextWord.length() == 1 && "月日时分秒".contains(nextWord)
|
||||
|| nextWord.length() == 2 && nextWord.equals("月份")) {
|
||||
@@ -110,8 +112,10 @@ public abstract class WordBasedSegment extends Segment {
|
||||
current.confirmNature(Nature.m, true);
|
||||
} else if (current.realWord.length() > 1) {
|
||||
char last = current.realWord.charAt(current.realWord.length() - 1);
|
||||
current = Vertex.newNumberInstance(
|
||||
current.realWord.substring(0, current.realWord.length() - 1));
|
||||
current =
|
||||
Vertex.newNumberInstance(
|
||||
current.realWord.substring(
|
||||
0, current.realWord.length() - 1));
|
||||
listIterator.previous();
|
||||
listIterator.previous();
|
||||
listIterator.set(current);
|
||||
@@ -121,7 +125,6 @@ public abstract class WordBasedSegment extends Segment {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@@ -143,9 +146,7 @@ public abstract class WordBasedSegment extends Segment {
|
||||
return wordNet.toGraph();
|
||||
}
|
||||
|
||||
/**
|
||||
* @deprecated
|
||||
*/
|
||||
/** @deprecated */
|
||||
private static List<AtomNode> atomSegment(String sSentence, int start, int end) {
|
||||
if (end < start) {
|
||||
throw new RuntimeException("start=" + start + " < end=" + end);
|
||||
@@ -161,7 +162,10 @@ public abstract class WordBasedSegment extends Segment {
|
||||
charTypeArray[i] = CharType.get(c);
|
||||
if (c == '.' && i < charArray.length - 1 && CharType.get(charArray[i + 1]) == 9) {
|
||||
charTypeArray[i] = 9;
|
||||
} else if (c == '.' && i < charArray.length - 1 && charArray[i + 1] >= '0' && charArray[i + 1] <= '9') {
|
||||
} else if (c == '.'
|
||||
&& i < charArray.length - 1
|
||||
&& charArray[i + 1] >= '0'
|
||||
&& charArray[i + 1] <= '9') {
|
||||
charTypeArray[i] = 5;
|
||||
} else if (charTypeArray[i] == 8) {
|
||||
charTypeArray[i] = 5;
|
||||
@@ -222,8 +226,10 @@ public abstract class WordBasedSegment extends Segment {
|
||||
while (true) {
|
||||
while (listIterator.hasNext()) {
|
||||
next = (Vertex) listIterator.next();
|
||||
if (!TextUtility.isAllNum(current.realWord) && !TextUtility.isAllChineseNum(current.realWord)
|
||||
|| !TextUtility.isAllNum(next.realWord) && !TextUtility.isAllChineseNum(next.realWord)) {
|
||||
if (!TextUtility.isAllNum(current.realWord)
|
||||
&& !TextUtility.isAllChineseNum(current.realWord)
|
||||
|| !TextUtility.isAllNum(next.realWord)
|
||||
&& !TextUtility.isAllChineseNum(next.realWord)) {
|
||||
current = next;
|
||||
} else {
|
||||
current = Vertex.newNumberInstance(current.realWord + next.realWord);
|
||||
@@ -246,16 +252,24 @@ public abstract class WordBasedSegment extends Segment {
|
||||
DoubleArrayTrie.Searcher searcher = CoreDictionary.trie.getSearcher(charArray, 0);
|
||||
|
||||
while (searcher.next()) {
|
||||
wordNetStorage.add(searcher.begin + 1, new Vertex(new String(charArray, searcher.begin, searcher.length),
|
||||
(CoreDictionary.Attribute) searcher.value, searcher.index));
|
||||
wordNetStorage.add(
|
||||
searcher.begin + 1,
|
||||
new Vertex(
|
||||
new String(charArray, searcher.begin, searcher.length),
|
||||
(CoreDictionary.Attribute) searcher.value,
|
||||
searcher.index));
|
||||
}
|
||||
|
||||
if (this.config.forceCustomDictionary) {
|
||||
this.customDictionary.parseText(charArray, new AhoCorasickDoubleArrayTrie.IHit<CoreDictionary.Attribute>() {
|
||||
public void hit(int begin, int end, CoreDictionary.Attribute value) {
|
||||
wordNetStorage.add(begin + 1, new Vertex(new String(charArray, begin, end - begin), value));
|
||||
}
|
||||
});
|
||||
this.customDictionary.parseText(
|
||||
charArray,
|
||||
new AhoCorasickDoubleArrayTrie.IHit<CoreDictionary.Attribute>() {
|
||||
public void hit(int begin, int end, CoreDictionary.Attribute value) {
|
||||
wordNetStorage.add(
|
||||
begin + 1,
|
||||
new Vertex(new String(charArray, begin, end - begin), value));
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
LinkedList<Vertex>[] vertexes = wordNetStorage.getVertexes();
|
||||
@@ -266,9 +280,10 @@ public abstract class WordBasedSegment extends Segment {
|
||||
if (vertexes[i].isEmpty()) {
|
||||
int j;
|
||||
for (j = i + 1;
|
||||
j < vertexes.length - 1 && (vertexes[j].isEmpty() || CharType.get(charArray[j - 1]) == 11);
|
||||
++j) {
|
||||
}
|
||||
j < vertexes.length - 1
|
||||
&& (vertexes[j].isEmpty()
|
||||
|| CharType.get(charArray[j - 1]) == 11);
|
||||
++j) {}
|
||||
|
||||
wordNetStorage.add(i, Segment.quickAtomSegment(charArray, i - 1, j - 1));
|
||||
i = j;
|
||||
@@ -291,12 +306,14 @@ public abstract class WordBasedSegment extends Segment {
|
||||
for (int i = 0; i < length; ++i) {
|
||||
Vertex vertex = (Vertex) listIterator.next();
|
||||
Term termMain = Segment.convert(vertex);
|
||||
//termList.add(termMain);
|
||||
// termList.add(termMain);
|
||||
addTerms(termList, vertex, line - 1);
|
||||
termMain.offset = line - 1;
|
||||
if (vertex.realWord.length() > 2) {
|
||||
label43:
|
||||
for (int currentLine = line; currentLine < line + vertex.realWord.length(); ++currentLine) {
|
||||
for (int currentLine = line;
|
||||
currentLine < line + vertex.realWord.length();
|
||||
++currentLine) {
|
||||
Iterator iterator = wordNetAll.descendingIterator(currentLine);
|
||||
|
||||
while (true) {
|
||||
@@ -310,11 +327,12 @@ public abstract class WordBasedSegment extends Segment {
|
||||
&& smallVertex.realWord.length() < this.config.indexMode);
|
||||
|
||||
if (smallVertex != vertex
|
||||
&& currentLine + smallVertex.realWord.length() <= line + vertex.realWord.length()) {
|
||||
&& currentLine + smallVertex.realWord.length()
|
||||
<= line + vertex.realWord.length()) {
|
||||
listIterator.add(smallVertex);
|
||||
//Term termSub = convert(smallVertex);
|
||||
//termSub.offset = currentLine - 1;
|
||||
//termList.add(termSub);
|
||||
// Term termSub = convert(smallVertex);
|
||||
// termSub.offset = currentLine - 1;
|
||||
// termList.add(termSub);
|
||||
addTerms(termList, smallVertex, currentLine - 1);
|
||||
}
|
||||
}
|
||||
@@ -328,7 +346,8 @@ public abstract class WordBasedSegment extends Segment {
|
||||
}
|
||||
|
||||
protected static void speechTagging(List<Vertex> vertexList) {
|
||||
Viterbi.compute(vertexList, CoreDictionaryTransformMatrixDictionary.transformMatrixDictionary);
|
||||
Viterbi.compute(
|
||||
vertexList, CoreDictionaryTransformMatrixDictionary.transformMatrixDictionary);
|
||||
}
|
||||
|
||||
protected void addTerms(List<Term> terms, Vertex vertex, int offset) {
|
||||
|
||||
@@ -1,14 +1,9 @@
|
||||
package com.hankcs.hanlp.seg.common;
|
||||
|
||||
import com.hankcs.hanlp.corpus.tag.Nature;
|
||||
//import com.hankcs.hanlp.dictionary.CoreDictionary;
|
||||
//import com.hankcs.hanlp.dictionary.CustomDictionary;
|
||||
//import com.hankcs.hanlp.dictionary.DynamicCustomDictionary;
|
||||
import lombok.Data;
|
||||
import lombok.ToString;
|
||||
|
||||
//import static com.hankcs.hanlp.HanLP.Config.CustomDictionaryPath;
|
||||
|
||||
@Data
|
||||
@ToString
|
||||
public class Term {
|
||||
@@ -72,5 +67,4 @@ public class Term {
|
||||
}
|
||||
return super.equals(obj);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user