mirror of https://github.com/tencentmusic/supersonic.git (synced 2025-12-11 12:07:42 +00:00)
headless integrates knowledge (#722)
@@ -190,6 +190,11 @@
            <groupId>dev.langchain4j</groupId>
            <artifactId>langchain4j-embeddings</artifactId>
        </dependency>
        <dependency>
            <groupId>com.hankcs</groupId>
            <artifactId>hanlp</artifactId>
            <version>${hanlp.version}</version>
        </dependency>

    </dependencies>
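
The ${hanlp.version} placeholder assumes a matching <properties> entry elsewhere in the POM, which this hunk does not show. A minimal sketch with an illustrative version number (not taken from this commit):

<properties>
    <!-- hypothetical value; the project defines the real one -->
    <hanlp.version>portable-1.8.4</hanlp.version>
</properties>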
63 common/src/main/java/com/hankcs/hanlp/LoadRemoveService.java Normal file
@@ -0,0 +1,63 @@
package com.hankcs.hanlp;

import com.tencent.supersonic.common.pojo.enums.DictWordType;
import lombok.Data;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.util.CollectionUtils;

import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.Set;

@Data
@Slf4j
public class LoadRemoveService {

    @Value("${mapper.remove.nature.prefix:}")
    private String mapperRemoveNaturePrefix;

    /**
     * Filters a list of nature strings: drops natures whose embedded view/model id
     * is not in detectModelIds, then drops natures matching the configured prefix.
     */
    public List removeNatures(List value, Set<Long> detectModelIds) {
        if (CollectionUtils.isEmpty(value)) {
            return value;
        }
        List<String> resultList = new ArrayList<>(value);
        if (!CollectionUtils.isEmpty(detectModelIds)) {
            resultList.removeIf(nature -> {
                if (Objects.isNull(nature)) {
                    return false;
                }
                Long modelId = getViewId(nature);
                if (Objects.nonNull(modelId)) {
                    return !detectModelIds.contains(modelId);
                }
                return false;
            });
        }
        if (StringUtils.isNotBlank(mapperRemoveNaturePrefix)) {
            resultList.removeIf(nature -> {
                if (Objects.isNull(nature)) {
                    return false;
                }
                return nature.startsWith(mapperRemoveNaturePrefix);
            });
        }
        return resultList;
    }

    /**
     * Extracts the view/model id from the segment after DictWordType.NATURE_SPILT;
     * returns null when no id segment is present or it is not numeric.
     */
    public Long getViewId(String nature) {
        try {
            String[] split = nature.split(DictWordType.NATURE_SPILT);
            if (split.length <= 1) {
                return null;
            }
            return Long.valueOf(split[1]);
        } catch (NumberFormatException e) {
            log.error("failed to parse view id from nature: " + nature, e);
        }
        return null;
    }
}
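
A minimal standalone sketch (not part of this commit) of how removeNatures filters nature strings, assuming DictWordType.NATURE_SPILT resolves to "_" (the constant is not shown in this diff). Without a Spring context the @Value prefix stays empty, so only the model-id filter runs:

import com.hankcs.hanlp.LoadRemoveService;

import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class LoadRemoveServiceDemo {
    public static void main(String[] args) {
        LoadRemoveService service = new LoadRemoveService();
        // Natures carry a model/view id after the separator, e.g. "dimension_3".
        List<String> natures = Arrays.asList("dimension_3", "metric_7", "nr");
        Set<Long> detectModelIds = new HashSet<>(Arrays.asList(3L));
        // "metric_7" is dropped (model 7 was not detected); "nr" has no id and is kept.
        List kept = service.removeNatures(natures, detectModelIds);
        System.out.println(kept); // expected: [dimension_3, nr]
    }
}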
331 common/src/main/java/com/hankcs/hanlp/collection/trie/bintrie/BaseNode.java Normal file
@@ -0,0 +1,331 @@
package com.hankcs.hanlp.collection.trie.bintrie;

import com.hankcs.hanlp.LoadRemoveService;
import com.hankcs.hanlp.corpus.io.ByteArray;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;
import java.util.AbstractMap;
import java.util.ArrayDeque;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Queue;
import java.util.Set;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public abstract class BaseNode<V> implements Comparable<BaseNode> {

    /**
     * Status array, cached for convenient lookup when loading
     */
    static final Status[] ARRAY_STATUS = Status.values();

    private static final Logger logger = LoggerFactory.getLogger(BaseNode.class);
    /**
     * Child nodes
     */
    protected BaseNode[] child;
    /**
     * Node status
     */
    protected Status status;
    /**
     * The character this node represents
     */
    protected char c;
    /**
     * The value this node holds
     */
    protected V value;

    protected String prefix = null;

    public BaseNode<V> transition(String path, int begin) {
        BaseNode<V> cur = this;
        for (int i = begin; i < path.length(); ++i) {
            cur = cur.getChild(path.charAt(i));
            if (cur == null || cur.status == Status.UNDEFINED_0) {
                return null;
            }
        }
        return cur;
    }

    public BaseNode<V> transition(char[] path, int begin) {
        BaseNode<V> cur = this;
        for (int i = begin; i < path.length; ++i) {
            cur = cur.getChild(path[i]);
            if (cur == null || cur.status == Status.UNDEFINED_0) {
                return null;
            }
        }
        return cur;
    }

    /**
     * Transition by a single character
     *
     * @param path the character to follow
     * @return the child node, or null if the transition is undefined
     */
    public BaseNode<V> transition(char path) {
        BaseNode<V> cur = this.getChild(path);
        if (cur == null || cur.status == Status.UNDEFINED_0) {
            return null;
        }
        return cur;
    }

    /**
     * Add a child node
     *
     * @return true if a new node was added, false if an existing node was modified
     */
    protected abstract boolean addChild(BaseNode node);

    /**
     * Whether this node has a child for the given character
     *
     * @param c the child's character
     * @return whether such a child exists
     */
    protected boolean hasChild(char c) {
        return getChild(c) != null;
    }

    protected char getChar() {
        return c;
    }

    /**
     * Get the child node
     *
     * @param c the child's character
     * @return the child node
     */
    public abstract BaseNode getChild(char c);

    /**
     * Get this node's value
     *
     * @return the value
     */
    public final V getValue() {
        return value;
    }

    /**
     * Set this node's value
     *
     * @param value the value
     */
    public final void setValue(V value) {
        this.value = value;
    }

    @Override
    public int compareTo(BaseNode other) {
        return compareTo(other.getChar());
    }

    /**
     * Overload: compare against a character
     *
     * @param other the character to compare with
     * @return the comparison result
     */
    public int compareTo(char other) {
        if (this.c > other) {
            return 1;
        }
        if (this.c < other) {
            return -1;
        }
        return 0;
    }

    /**
     * Get the node's word-formation status
     *
     * @return the status
     */
    public Status getStatus() {
        return status;
    }

    protected void walk(StringBuilder sb, Set<Map.Entry<String, V>> entrySet) {
        sb.append(c);
        if (status == Status.WORD_MIDDLE_2 || status == Status.WORD_END_3) {
            entrySet.add(new TrieEntry(sb.toString(), value));
        }
        if (child == null) {
            return;
        }
        for (BaseNode node : child) {
            if (node == null) {
                continue;
            }
            node.walk(new StringBuilder(sb.toString()), entrySet);
        }
    }

    protected void walkToSave(DataOutputStream out) throws IOException {
        out.writeChar(c);
        out.writeInt(status.ordinal());
        int childSize = 0;
        if (child != null) {
            childSize = child.length;
        }
        out.writeInt(childSize);
        if (child == null) {
            return;
        }
        for (BaseNode node : child) {
            node.walkToSave(out);
        }
    }

    protected void walkToSave(ObjectOutput out) throws IOException {
        out.writeChar(c);
        out.writeInt(status.ordinal());
        if (status == Status.WORD_END_3 || status == Status.WORD_MIDDLE_2) {
            out.writeObject(value);
        }
        int childSize = 0;
        if (child != null) {
            childSize = child.length;
        }
        out.writeInt(childSize);
        if (child == null) {
            return;
        }
        for (BaseNode node : child) {
            node.walkToSave(out);
        }
    }

    protected void walkToLoad(ByteArray byteArray, _ValueArray<V> valueArray) {
        c = byteArray.nextChar();
        status = ARRAY_STATUS[byteArray.nextInt()];
        if (status == Status.WORD_END_3 || status == Status.WORD_MIDDLE_2) {
            value = valueArray.nextValue();
        }
        int childSize = byteArray.nextInt();
        child = new BaseNode[childSize];
        for (int i = 0; i < childSize; ++i) {
            child[i] = new Node<V>();
            child[i].walkToLoad(byteArray, valueArray);
        }
    }

    protected void walkToLoad(ObjectInput byteArray) throws IOException, ClassNotFoundException {
        c = byteArray.readChar();
        status = ARRAY_STATUS[byteArray.readInt()];
        if (status == Status.WORD_END_3 || status == Status.WORD_MIDDLE_2) {
            value = (V) byteArray.readObject();
        }
        int childSize = byteArray.readInt();
        child = new BaseNode[childSize];
        for (int i = 0; i < childSize; ++i) {
            child[i] = new Node<V>();
            child[i].walkToLoad(byteArray);
        }
    }

    public enum Status {
        /**
         * Undefined; used for deleting entries
         */
        UNDEFINED_0,
        /**
         * Not the end of a word
         */
        NOT_WORD_1,
        /**
         * End of a word, and longer words continue through this node
         */
        WORD_MIDDLE_2,
        /**
         * End of a word, with no continuation
         */
        WORD_END_3,
    }

    public class TrieEntry extends AbstractMap.SimpleEntry<String, V> implements Comparable<TrieEntry> {

        public TrieEntry(String key, V value) {
            super(key, value);
        }

        @Override
        public int compareTo(TrieEntry o) {
            return getKey().compareTo(String.valueOf(o.getKey()));
        }
    }

    @Override
    public String toString() {
        return "BaseNode{"
                + "child="
                + Arrays.toString(child)
                + ", status="
                + status
                + ", c="
                + c
                + ", value="
                + value
                + ", prefix='"
                + prefix
                + '\''
                + '}';
    }

    /**
     * Emits this node as a trie entry if it ends a word, filtering its natures
     * through LoadRemoveService by the detected model ids.
     */
    public void walkNode(Set<Map.Entry<String, V>> entrySet, Set<Long> detectModelIds) {
        if (status == Status.WORD_MIDDLE_2 || status == Status.WORD_END_3) {
            logger.debug("detectModelIds:{},before:{}", detectModelIds, value.toString());
            List natures = new LoadRemoveService().removeNatures((List) value, detectModelIds);
            String name = this.prefix != null ? this.prefix + c : "" + c;
            logger.debug("name:{},after:{},natures:{}", name, (List) value, natures);
            entrySet.add(new TrieEntry(name, (V) natures));
        }
    }

    /**
     * Breadth-first walk that collects at most limit entries, propagating each
     * node's string prefix down to its children.
     *
     * @param sb prefix accumulated so far
     * @param entrySet output set of collected entries
     * @param limit maximum number of entries to collect
     * @param detectModelIds model ids used to filter natures
     */
    public void walkLimit(StringBuilder sb, Set<Map.Entry<String, V>> entrySet, int limit, Set<Long> detectModelIds) {
        Queue<BaseNode> queue = new ArrayDeque<>();
        this.prefix = sb.toString();
        queue.add(this);
        while (!queue.isEmpty()) {
            if (entrySet.size() >= limit) {
                break;
            }
            BaseNode root = queue.poll();
            if (root == null) {
                continue;
            }
            root.walkNode(entrySet, detectModelIds);
            if (root.child == null) {
                continue;
            }
            String prefix = root.prefix + root.c;
            for (BaseNode node : root.child) {
                if (Objects.nonNull(node)) {
                    node.prefix = prefix;
                    queue.add(node);
                }
            }
        }
    }
}
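
To clarify the breadth-first walk that walkLimit implements, here is a standalone toy (not HanLP code) with the same shape: store each node's prefix before enqueueing its children, emit prefix plus character for word-ending nodes, and stop once the limit is reached. The toy skips the dummy root's character, a simplification of its own:

import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Queue;

public class WalkLimitToy {

    static class Node {
        char c;
        boolean isWord;
        String prefix;                                   // set by the parent during the walk
        Map<Character, Node> children = new HashMap<>();
    }

    static void put(Node root, String word) {
        Node cur = root;
        for (char ch : word.toCharArray()) {
            cur = cur.children.computeIfAbsent(ch, k -> {
                Node n = new Node();
                n.c = k;
                return n;
            });
        }
        cur.isWord = true;
    }

    static List<String> walkLimit(Node root, int limit) {
        List<String> out = new ArrayList<>();
        root.prefix = "";
        Queue<Node> queue = new ArrayDeque<>();
        queue.add(root);
        while (!queue.isEmpty()) {
            if (out.size() >= limit) {
                break;                                   // same early exit as above
            }
            Node node = queue.poll();
            if (node.isWord) {
                out.add(node.prefix + node.c);           // same naming rule as walkNode
            }
            String childPrefix = (node == root) ? "" : node.prefix + node.c;
            for (Node child : node.children.values()) {
                child.prefix = childPrefix;              // propagate prefix, as walkLimit does
                queue.add(child);
            }
        }
        return out;
    }

    public static void main(String[] args) {
        Node root = new Node();
        put(root, "so");
        put(root, "son");
        put(root, "song");
        System.out.println(walkLimit(root, 2));          // [so, son]: breadth-first, shortest first
    }
}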
393 common/src/main/java/com/hankcs/hanlp/dictionary/CoreDictionary.java Normal file
@@ -0,0 +1,393 @@
package com.hankcs.hanlp.dictionary;

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.collection.trie.DoubleArrayTrie;
import com.hankcs.hanlp.corpus.io.ByteArray;
import com.hankcs.hanlp.corpus.io.IOUtil;
import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.utility.Predefine;
import com.hankcs.hanlp.utility.TextUtility;

import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.DataOutputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.util.Collection;
import java.util.TreeMap;

/**
 * Core dictionary implemented with a DoubleArrayTrie
 */
public class CoreDictionary {

    public static DoubleArrayTrie<Attribute> trie = new DoubleArrayTrie<Attribute>();

    public static final String PATH = HanLP.Config.CoreDictionaryPath;

    // Load the dictionary automatically on class initialization
    static {
        long start = System.currentTimeMillis();
        if (!load(PATH)) {
            throw new IllegalArgumentException("核心词典" + PATH + "加载失败");
        } else {
            Predefine.logger.info(PATH + "加载成功," + trie.size() + "个词条,耗时"
                    + (System.currentTimeMillis() - start) + "ms");
        }
    }

    // Some special WORD_IDs
    public static final int NR_WORD_ID = getWordID(Predefine.TAG_PEOPLE);
    public static final int NS_WORD_ID = getWordID(Predefine.TAG_PLACE);
    public static final int NT_WORD_ID = getWordID(Predefine.TAG_GROUP);
    public static final int T_WORD_ID = getWordID(Predefine.TAG_TIME);
    public static final int X_WORD_ID = getWordID(Predefine.TAG_CLUSTER);
    public static final int M_WORD_ID = getWordID(Predefine.TAG_NUMBER);
    public static final int NX_WORD_ID = getWordID(Predefine.TAG_PROPER);

    private static boolean load(String path) {
        Predefine.logger.info("核心词典开始加载:" + path);
        if (loadDat(path)) {
            return true;
        }
        // No binary cache yet: parse the text dictionary, then write the cache.
        TreeMap<String, Attribute> map = new TreeMap<String, Attribute>();
        BufferedReader br = null;
        try {
            br = new BufferedReader(new InputStreamReader(IOUtil.newInputStream(path), "UTF-8"));
            String line;
            int totalFrequency = 0;
            long start = System.currentTimeMillis();
            while ((line = br.readLine()) != null) {
                // Each line: word followed by (nature, frequency) pairs.
                String[] param = line.split("\\s");
                int natureCount = (param.length - 1) / 2;
                Attribute attribute = new Attribute(natureCount);
                for (int i = 0; i < natureCount; ++i) {
                    attribute.nature[i] = Nature.create(param[1 + 2 * i]);
                    attribute.frequency[i] = Integer.parseInt(param[2 + 2 * i]);
                    attribute.totalFrequency += attribute.frequency[i];
                }
                map.put(param[0], attribute);
                totalFrequency += attribute.totalFrequency;
            }
            Predefine.logger.info(
                    "核心词典读入词条" + map.size() + " 全部频次" + totalFrequency + ",耗时" + (System.currentTimeMillis() - start)
                            + "ms");
            br.close();
            trie.build(map);
            Predefine.logger.info("核心词典加载成功:" + trie.size() + "个词条,下面将写入缓存……");
            try {
                DataOutputStream out = new DataOutputStream(
                        new BufferedOutputStream(IOUtil.newOutputStream(path + Predefine.BIN_EXT)));
                Collection<Attribute> attributeList = map.values();
                out.writeInt(attributeList.size());
                for (Attribute attribute : attributeList) {
                    out.writeInt(attribute.totalFrequency);
                    out.writeInt(attribute.nature.length);
                    for (int i = 0; i < attribute.nature.length; ++i) {
                        out.writeInt(attribute.nature[i].ordinal());
                        out.writeInt(attribute.frequency[i]);
                    }
                }
                trie.save(out);
                out.writeInt(totalFrequency);
                Predefine.setTotalFrequency(totalFrequency);
                out.close();
            } catch (Exception e) {
                Predefine.logger.warning("保存失败" + e);
                return false;
            }
        } catch (FileNotFoundException e) {
            Predefine.logger.warning("核心词典" + path + "不存在!" + e);
            return false;
        } catch (IOException e) {
            Predefine.logger.warning("核心词典" + path + "读取错误!" + e);
            return false;
        }

        return true;
    }

    /**
     * Load the double array from the binary cache on disk
     *
     * @param path dictionary path (the cache file adds Predefine.BIN_EXT)
     * @return whether loading succeeded
     */
    static boolean loadDat(String path) {
        try {
            ByteArray byteArray = ByteArray.createByteArray(path + Predefine.BIN_EXT);
            if (byteArray == null) {
                return false;
            }
            int size = byteArray.nextInt();
            Attribute[] attributes = new Attribute[size];
            final Nature[] natureIndexArray = Nature.values();
            for (int i = 0; i < size; ++i) {
                // First int: total frequency; second: number of natures
                int currentTotalFrequency = byteArray.nextInt();
                int length = byteArray.nextInt();
                attributes[i] = new Attribute(length);
                attributes[i].totalFrequency = currentTotalFrequency;
                for (int j = 0; j < length; ++j) {
                    attributes[i].nature[j] = natureIndexArray[byteArray.nextInt()];
                    attributes[i].frequency[j] = byteArray.nextInt();
                }
            }
            if (!trie.load(byteArray, attributes)) {
                return false;
            }
            int totalFrequency = 0;
            if (byteArray.hasMore()) {
                totalFrequency = byteArray.nextInt();
            } else {
                for (Attribute attribute : attributes) {
                    totalFrequency += attribute.totalFrequency;
                }
            }
            Predefine.setTotalFrequency(totalFrequency);
        } catch (Exception e) {
            Predefine.logger.warning("读取失败,问题发生在" + e);
            return false;
        }
        return true;
    }

    /**
     * Get an entry
     *
     * @param key the word
     * @return its attribute, or null if absent
     */
    public static Attribute get(String key) {
        return trie.get(key);
    }

    /**
     * Get an entry
     *
     * @param wordID the word's ID
     * @return its attribute
     */
    public static Attribute get(int wordID) {
        return trie.get(wordID);
    }

    /**
     * Get a term's total frequency
     *
     * @param term the word
     * @return its total frequency, or 0 if absent
     */
    public static int getTermFrequency(String term) {
        Attribute attribute = get(term);
        if (attribute == null) {
            return 0;
        }
        return attribute.totalFrequency;
    }

    /**
     * Whether the dictionary contains the word
     *
     * @param key the word
     * @return whether it is present
     */
    public static boolean contains(String key) {
        return trie.get(key) != null;
    }

    /**
     * Attributes of a word in the core dictionary
     */
    public static class Attribute implements Serializable {

        /**
         * List of natures (parts of speech)
         */
        public Nature[] nature;
        /**
         * Frequency of each nature
         */
        public int[] frequency;

        public int totalFrequency;
        public String original = null;

        public Attribute(int size) {
            nature = new Nature[size];
            frequency = new int[size];
        }

        public Attribute(Nature[] nature, int[] frequency) {
            this.nature = nature;
            this.frequency = frequency;
        }

        public Attribute(Nature nature, int frequency) {
            this(1);
            this.nature[0] = nature;
            this.frequency[0] = frequency;
            totalFrequency = frequency;
        }

        public Attribute(Nature[] nature, int[] frequency, int totalFrequency) {
            this.nature = nature;
            this.frequency = frequency;
            this.totalFrequency = totalFrequency;
        }

        /**
         * Construct with a single nature and a default frequency of 1000
         *
         * @param nature the nature
         */
        public Attribute(Nature nature) {
            this(nature, 1000);
        }

        public static Attribute create(String natureWithFrequency) {
            try {
                String[] param = natureWithFrequency.split(" ");
                if (param.length % 2 != 0) {
                    // child lock: tolerate a malformed entry
                    return new Attribute(Nature.create(natureWithFrequency.trim()), 1);
                }
                int natureCount = param.length / 2;
                Attribute attribute = new Attribute(natureCount);
                for (int i = 0; i < natureCount; ++i) {
                    attribute.nature[i] = Nature.create(param[2 * i]);
                    attribute.frequency[i] = Integer.parseInt(param[1 + 2 * i]);
                    attribute.totalFrequency += attribute.frequency[i];
                }
                return attribute;
            } catch (Exception e) {
                Predefine.logger.warning("使用字符串" + natureWithFrequency + "创建词条属性失败!"
                        + TextUtility.exceptionToString(e));
                return null;
            }
        }

        /**
         * Load from a byte stream
         *
         * @param byteArray source bytes
         * @param natureIndexArray nature lookup table indexed by ordinal
         * @return the loaded attribute
         */
        public static Attribute create(ByteArray byteArray, Nature[] natureIndexArray) {
            int currentTotalFrequency = byteArray.nextInt();
            int length = byteArray.nextInt();
            Attribute attribute = new Attribute(length);
            attribute.totalFrequency = currentTotalFrequency;
            for (int j = 0; j < length; ++j) {
                attribute.nature[j] = natureIndexArray[byteArray.nextInt()];
                attribute.frequency[j] = byteArray.nextInt();
            }

            return attribute;
        }

        /**
         * Get the frequency of a nature
         *
         * @param nature the nature as a string
         * @return its frequency
         * @deprecated prefer the overload taking a Nature!
         */
        public int getNatureFrequency(String nature) {
            try {
                Nature pos = Nature.create(nature);
                return getNatureFrequency(pos);
            } catch (IllegalArgumentException e) {
                return 0;
            }
        }

        /**
         * Get the frequency of a nature
         *
         * @param nature the nature
         * @return its frequency
         */
        public int getNatureFrequency(final Nature nature) {
            int i = 0;
            for (Nature pos : this.nature) {
                if (nature == pos) {
                    return frequency[i];
                }
                ++i;
            }
            return 0;
        }

        /**
         * Whether the word has a given nature
         *
         * @param nature the nature
         * @return whether it is present
         */
        public boolean hasNature(Nature nature) {
            return getNatureFrequency(nature) > 0;
        }

        /**
         * Whether any nature starts with the given prefix
         *
         * @param prefix nature prefix; e.g. "u" matches ude, uzhe, and so on
         * @return whether such a nature exists
         */
        public boolean hasNatureStartsWith(String prefix) {
            for (Nature n : nature) {
                if (n.startsWith(prefix)) {
                    return true;
                }
            }
            return false;
        }

        @Override
        public String toString() {
            final StringBuilder sb = new StringBuilder();
            for (int i = 0; i < nature.length; ++i) {
                sb.append(nature[i]).append(' ').append(frequency[i]).append(' ');
            }
            return sb.toString();
        }

        public void save(DataOutputStream out) throws IOException {
            out.writeInt(totalFrequency);
            out.writeInt(nature.length);
            for (int i = 0; i < nature.length; ++i) {
                out.writeInt(nature[i].ordinal());
                out.writeInt(frequency[i]);
            }
        }
    }

    /**
     * Get the word's ID
     *
     * @param a the word
     * @return its ID, or -1 if absent
     */
    public static int getWordID(String a) {
        return CoreDictionary.trie.exactMatchSearch(a);
    }

    /**
     * Hot-reload the core dictionary.
     * Cluster environments (or other IOAdapters) must delete the cache file themselves.
     *
     * @return whether the reload succeeded
     */
    public static boolean reload() {
        String path = CoreDictionary.PATH;
        IOUtil.deleteFile(path + Predefine.BIN_EXT);

        return load(path);
    }
}
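
A small sketch (not part of this commit) of the "nature frequency" pair format that Attribute.create parses. Attribute is a static nested class, so this should not trigger CoreDictionary's dictionary-loading static initializer; it only assumes HanLP's Nature tag class on the classpath:

import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.dictionary.CoreDictionary;

public class AttributeDemo {
    public static void main(String[] args) {
        // "n 1000 v 10": noun with frequency 1000, verb with frequency 10.
        CoreDictionary.Attribute attr = CoreDictionary.Attribute.create("n 1000 v 10");
        System.out.println(attr.hasNature(Nature.n));          // true
        System.out.println(attr.getNatureFrequency(Nature.v)); // 10
        System.out.println(attr.totalFrequency);               // 1010
    }
}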
342 common/src/main/java/com/hankcs/hanlp/seg/WordBasedSegment.java Normal file
@@ -0,0 +1,342 @@
package com.hankcs.hanlp.seg;

import com.hankcs.hanlp.algorithm.Viterbi;
import com.hankcs.hanlp.collection.AhoCorasick.AhoCorasickDoubleArrayTrie;
import com.hankcs.hanlp.collection.trie.DoubleArrayTrie;
import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.dictionary.CoreDictionary;
import com.hankcs.hanlp.seg.common.Term;
import com.hankcs.hanlp.dictionary.CoreDictionaryTransformMatrixDictionary;
import com.hankcs.hanlp.dictionary.other.CharType;
import com.hankcs.hanlp.seg.NShort.Path.AtomNode;
import com.hankcs.hanlp.seg.common.Graph;
import com.hankcs.hanlp.seg.common.Vertex;
import com.hankcs.hanlp.seg.common.WordNet;
import com.hankcs.hanlp.utility.TextUtility;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.ListIterator;

public abstract class WordBasedSegment extends Segment {

    public WordBasedSegment() {
    }

    /**
     * Applies rule-based fixes to the segmentation result and merges it into the
     * optimum word net.
     */
    protected static void generateWord(List<Vertex> linkedArray, WordNet wordNetOptimum) {
        fixResultByRule(linkedArray);
        wordNetOptimum.addAll(linkedArray);
    }

    /**
     * Rule-based post-processing: merge consecutive numbers, tag delimiters,
     * split digital words around '-', and merge date/time elements.
     */
    protected static void fixResultByRule(List<Vertex> linkedArray) {
        mergeContinueNumIntoOne(linkedArray);
        changeDelimiterPOS(linkedArray);
        splitMiddleSlashFromDigitalWords(linkedArray);
        checkDateElements(linkedArray);
    }

    // Tag "-", "--" and "—" vertices as punctuation (nature w).
    static void changeDelimiterPOS(List<Vertex> linkedArray) {
        for (Vertex vertex : linkedArray) {
            if (vertex.realWord.equals("--") || vertex.realWord.equals("—") || vertex.realWord.equals("-")) {
                vertex.confirmNature(Nature.w);
            }
        }
    }

    private static void splitMiddleSlashFromDigitalWords(List<Vertex> linkedArray) {
        if (linkedArray.size() >= 2) {
            ListIterator<Vertex> listIterator = linkedArray.listIterator();
            Vertex next = listIterator.next();

            for (Vertex current = next; listIterator.hasNext(); current = next) {
                next = listIterator.next();
                Nature currentNature = current.getNature();
                if (currentNature == Nature.nx && (next.hasNature(Nature.q) || next.hasNature(Nature.n))) {
                    // Note: split with a limit of 1 never actually splits, so as
                    // written the branch below is effectively inert.
                    String[] param = current.realWord.split("-", 1);
                    if (param.length == 2 && TextUtility.isAllNum(param[0]) && TextUtility.isAllNum(param[1])) {
                        current = current.copy();
                        current.realWord = param[0];
                        current.confirmNature(Nature.m);
                        listIterator.previous();
                        listIterator.previous();
                        listIterator.set(current);
                        listIterator.next();
                        listIterator.add(Vertex.newPunctuationInstance("-"));
                        listIterator.add(Vertex.newNumberInstance(param[1]));
                    }
                }
            }
        }
    }

    private static void checkDateElements(List<Vertex> linkedArray) {
        if (linkedArray.size() >= 2) {
            ListIterator<Vertex> listIterator = linkedArray.listIterator();
            Vertex next = listIterator.next();

            for (Vertex current = next; listIterator.hasNext(); current = next) {
                next = listIterator.next();
                if (TextUtility.isAllNum(current.realWord) || TextUtility.isAllChineseNum(current.realWord)) {
                    String nextWord = next.realWord;
                    if (nextWord.length() == 1 && "月日时分秒".contains(nextWord)
                            || nextWord.length() == 2 && nextWord.equals("月份")) {
                        mergeDate(listIterator, next, current);
                    } else if (nextWord.equals("年")) {
                        if (TextUtility.isYearTime(current.realWord)) {
                            mergeDate(listIterator, next, current);
                        } else {
                            current.confirmNature(Nature.m);
                        }
                    } else if (current.realWord.endsWith("点")) {
                        current.confirmNature(Nature.t, true);
                    } else {
                        char[] tmpCharArray = current.realWord.toCharArray();
                        String lastChar = String.valueOf(tmpCharArray[tmpCharArray.length - 1]);
                        if (!"∶·././".contains(lastChar)) {
                            current.confirmNature(Nature.m, true);
                        } else if (current.realWord.length() > 1) {
                            char last = current.realWord.charAt(current.realWord.length() - 1);
                            current = Vertex.newNumberInstance(
                                    current.realWord.substring(0, current.realWord.length() - 1));
                            listIterator.previous();
                            listIterator.previous();
                            listIterator.set(current);
                            listIterator.next();
                            listIterator.add(Vertex.newPunctuationInstance(String.valueOf(last)));
                        }
                    }
                }
            }
        }
    }

    // Merge a number vertex and the following date/time unit into one time vertex.
    private static void mergeDate(ListIterator<Vertex> listIterator, Vertex next, Vertex current) {
        current = Vertex.newTimeInstance(current.realWord + next.realWord);
        listIterator.previous();
        listIterator.previous();
        listIterator.set(current);
        listIterator.next();
        listIterator.next();
        listIterator.remove();
    }

    protected static List<Term> convert(List<Vertex> vertexList) {
        return Segment.convert(vertexList, false);
    }

    protected static Graph generateBiGraph(WordNet wordNet) {
        return wordNet.toGraph();
    }

    /**
     * @deprecated
     */
    private static List<AtomNode> atomSegment(String sSentence, int start, int end) {
        if (end < start) {
            throw new RuntimeException("start=" + start + " < end=" + end);
        }
        List<AtomNode> atomSegment = new ArrayList<>();
        int pCur = 0;
        StringBuilder sb = new StringBuilder();
        char[] charArray = sSentence.substring(start, end).toCharArray();
        // CharType codes follow HanLP's Predefine constants,
        // e.g. 5=CT_SINGLE, 8=CT_LETTER, 9=CT_NUM.
        int[] charTypeArray = new int[charArray.length];

        for (int i = 0; i < charArray.length; ++i) {
            char c = charArray[i];
            charTypeArray[i] = CharType.get(c);
            if (c == '.' && i < charArray.length - 1 && CharType.get(charArray[i + 1]) == 9) {
                charTypeArray[i] = 9; // '.' before a numeric-typed char counts as numeric
            } else if (c == '.' && i < charArray.length - 1 && charArray[i + 1] >= '0' && charArray[i + 1] <= '9') {
                charTypeArray[i] = 5;
            } else if (charTypeArray[i] == 8) {
                charTypeArray[i] = 5; // treat letters as single-byte chars
            }
        }

        while (pCur < charArray.length) {
            int nCurType = charTypeArray[pCur];
            if (nCurType == 7 || nCurType == 10 || nCurType == 6 || nCurType == 17) {
                // Chinese, index, delimiter or other: emit a single-char node
                String single = String.valueOf(charArray[pCur]);
                if (single.length() != 0) {
                    atomSegment.add(new AtomNode(single, nCurType));
                }
                ++pCur;
            } else if (pCur < charArray.length - 1 && (nCurType == 5 || nCurType == 9)) {
                // Run of same-typed single-byte or numeric chars: merge into one node
                sb.delete(0, sb.length());
                sb.append(charArray[pCur]);
                boolean reachEnd = true;

                while (pCur < charArray.length - 1) {
                    ++pCur;
                    int nNextType = charTypeArray[pCur];
                    if (nNextType != nCurType) {
                        reachEnd = false;
                        break;
                    }
                    sb.append(charArray[pCur]);
                }

                atomSegment.add(new AtomNode(sb.toString(), nCurType));
                if (reachEnd) {
                    ++pCur;
                }
            } else {
                atomSegment.add(new AtomNode(charArray[pCur], nCurType));
                ++pCur;
            }
        }

        return atomSegment;
    }

    private static void mergeContinueNumIntoOne(List<Vertex> linkedArray) {
        if (linkedArray.size() >= 2) {
            ListIterator<Vertex> listIterator = linkedArray.listIterator();
            Vertex next = listIterator.next();
            Vertex current = next;

            while (listIterator.hasNext()) {
                next = listIterator.next();
                if (!TextUtility.isAllNum(current.realWord) && !TextUtility.isAllChineseNum(current.realWord)
                        || !TextUtility.isAllNum(next.realWord) && !TextUtility.isAllChineseNum(next.realWord)) {
                    current = next;
                } else {
                    // Both are numbers: merge into one number vertex in place.
                    current = Vertex.newNumberInstance(current.realWord + next.realWord);
                    listIterator.previous();
                    listIterator.previous();
                    listIterator.set(current);
                    listIterator.next();
                    listIterator.next();
                    listIterator.remove();
                }
            }
        }
    }

    protected void generateWordNet(final WordNet wordNetStorage) {
        final char[] charArray = wordNetStorage.charArray;
        DoubleArrayTrie.Searcher searcher = CoreDictionary.trie.getSearcher(charArray, 0);

        while (searcher.next()) {
            wordNetStorage.add(searcher.begin + 1, new Vertex(new String(charArray, searcher.begin, searcher.length),
                    (CoreDictionary.Attribute) searcher.value, searcher.index));
        }

        if (this.config.forceCustomDictionary) {
            this.customDictionary.parseText(charArray, new AhoCorasickDoubleArrayTrie.IHit<CoreDictionary.Attribute>() {
                public void hit(int begin, int end, CoreDictionary.Attribute value) {
                    wordNetStorage.add(begin + 1, new Vertex(new String(charArray, begin, end - begin), value));
                }
            });
        }

        // Fill gaps that no dictionary word covers with quick atom segmentation.
        LinkedList<Vertex>[] vertexes = wordNetStorage.getVertexes();
        int i = 1;
        while (i < vertexes.length) {
            if (vertexes[i].isEmpty()) {
                int j;
                for (j = i + 1;
                        j < vertexes.length - 1 && (vertexes[j].isEmpty() || CharType.get(charArray[j - 1]) == 11);
                        ++j) {
                }

                wordNetStorage.add(i, Segment.quickAtomSegment(charArray, i - 1, j - 1));
                i = j;
            } else {
                i += vertexes[i].getLast().realWord.length();
            }
        }
    }

    protected List<Term> decorateResultForIndexMode(List<Vertex> vertexList, WordNet wordNetAll) {
        List<Term> termList = new LinkedList<>();
        int line = 1;
        ListIterator<Vertex> listIterator = vertexList.listIterator();
        listIterator.next();
        int length = vertexList.size() - 2;

        for (int i = 0; i < length; ++i) {
            Vertex vertex = listIterator.next();
            Term termMain = Segment.convert(vertex);
            //termList.add(termMain);
            addTerms(termList, vertex, line - 1);
            termMain.offset = line - 1;
            if (vertex.realWord.length() > 2) {
                // In index mode, also emit the smaller words nested inside long words.
                label43:
                for (int currentLine = line; currentLine < line + vertex.realWord.length(); ++currentLine) {
                    Iterator iterator = wordNetAll.descendingIterator(currentLine);

                    while (true) {
                        Vertex smallVertex;
                        do {
                            if (!iterator.hasNext()) {
                                continue label43;
                            }
                            smallVertex = (Vertex) iterator.next();
                        } while ((termMain.nature != Nature.mq || !smallVertex.hasNature(Nature.q))
                                && smallVertex.realWord.length() < this.config.indexMode);

                        if (smallVertex != vertex
                                && currentLine + smallVertex.realWord.length() <= line + vertex.realWord.length()) {
                            listIterator.add(smallVertex);
                            //Term termSub = convert(smallVertex);
                            //termSub.offset = currentLine - 1;
                            //termList.add(termSub);
                            addTerms(termList, smallVertex, currentLine - 1);
                        }
                    }
                }
            }

            line += vertex.realWord.length();
        }

        return termList;
    }

    // Run Viterbi POS tagging over the vertex list using the core transform matrix.
    protected static void speechTagging(List<Vertex> vertexList) {
        Viterbi.compute(vertexList, CoreDictionaryTransformMatrixDictionary.transformMatrixDictionary);
    }

    // Emit one Term per candidate nature of the vertex, carrying that nature's frequency.
    protected void addTerms(List<Term> terms, Vertex vertex, int offset) {
        for (int i = 0; i < vertex.attribute.nature.length; i++) {
            Term term = new Term(vertex.realWord, vertex.attribute.nature[i]);
            term.setFrequency(vertex.attribute.frequency[i]);
            term.offset = offset;
            terms.add(term);
        }
    }
}
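
mergeDate and mergeContinueNumIntoOne both rely on the same ListIterator idiom: step back twice, overwrite the first element, step forward past it, then remove the absorbed second element. A standalone sketch of that idiom on plain strings (the real code also checks that the left token is numeric):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.ListIterator;

public class ListMergeDemo {
    public static void main(String[] args) {
        List<String> tokens = new ArrayList<>(Arrays.asList("2024", "年", "底"));
        ListIterator<String> it = tokens.listIterator();
        String current = it.next();
        while (it.hasNext()) {
            String next = it.next();
            if (next.length() == 1 && "年月日时分秒".contains(next)) {
                current = current + next;   // merge number + date unit
                it.previous();              // back past next
                it.previous();              // back past current
                it.set(current);            // overwrite the current slot
                it.next();                  // step over the merged token
                it.next();                  // step onto the absorbed token
                it.remove();                // and drop it
            } else {
                current = next;
            }
        }
        System.out.println(tokens); // [2024年, 底]
    }
}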
76 common/src/main/java/com/hankcs/hanlp/seg/common/Term.java Normal file
@@ -0,0 +1,76 @@
package com.hankcs.hanlp.seg.common;

import com.hankcs.hanlp.corpus.tag.Nature;
//import com.hankcs.hanlp.dictionary.CoreDictionary;
//import com.hankcs.hanlp.dictionary.CustomDictionary;
//import com.hankcs.hanlp.dictionary.DynamicCustomDictionary;
import lombok.Data;
import lombok.ToString;

//import static com.hankcs.hanlp.HanLP.Config.CustomDictionaryPath;

@Data
@ToString
public class Term {

    public String word;

    public Nature nature;
    public int offset;
    public int frequency = 0;

    public Term(String word, Nature nature) {
        this.word = word;
        this.nature = nature;
    }

    public Term(String word, Nature nature, int offset) {
        this.word = word;
        this.nature = nature;
        this.offset = offset;
    }

    public Term(String word, Nature nature, int offset, int frequency) {
        this.word = word;
        this.nature = nature;
        this.offset = offset;
        this.frequency = frequency;
    }

    public int length() {
        return this.word.length();
    }

    public int getFrequency() {
        if (frequency > 0) {
            return frequency;
        }
        // todo opt
        /*
        String wordOri = word.toLowerCase();
        CoreDictionary.Attribute attribute = getDynamicCustomDictionary().get(wordOri);
        if (attribute == null) {
            attribute = CoreDictionary.get(wordOri);
            if (attribute == null) {
                attribute = CustomDictionary.get(wordOri);
            }
        }
        if (attribute != null && nature != null && attribute.hasNature(nature)) {
            return attribute.getNatureFrequency(nature);
        }
        return attribute == null ? 0 : attribute.totalFrequency;
        */
        return 0;
    }

    // Note: equals() is overridden without hashCode(); Lombok's @Data skips
    // generating the pair when either is already declared.
    @Override
    public boolean equals(Object obj) {
        if (obj instanceof Term) {
            Term term = (Term) obj;
            if (this.nature == term.nature && this.word.equals(term.word)) {
                return true;
            }
        }
        return super.equals(obj);
    }
}
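
A minimal usage sketch (not part of this commit) of the Term value class, assuming HanLP's Nature class is on the classpath:

import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.seg.common.Term;

public class TermDemo {
    public static void main(String[] args) {
        Term term = new Term("周杰伦", Nature.nr, 0, 100);
        System.out.println(term.length());        // 3
        System.out.println(term.getFrequency());  // 100: an explicit frequency wins
        // equals() matches on word + nature and ignores offset:
        System.out.println(term.equals(new Term("周杰伦", Nature.nr, 5))); // true
    }
}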