headless integrates knowledge (#722)

daikon
2024-02-05 20:30:57 +08:00
committed by GitHub
parent 74d0ec2b23
commit 9600456bae
174 changed files with 1908 additions and 1817 deletions

View File

@@ -1,19 +0,0 @@
package com.tencent.supersonic.chat.api.pojo;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class RelatedSchemaElement {
private Long dimensionId;
private boolean isNecessary;
}

View File

@@ -1,58 +0,0 @@
package com.tencent.supersonic.chat.api.pojo;
import com.google.common.base.Objects;
import com.google.common.collect.Lists;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.Getter;
import lombok.NoArgsConstructor;
import java.io.Serializable;
import java.util.List;
@Data
@Getter
@Builder
@AllArgsConstructor
@NoArgsConstructor
public class SchemaElement implements Serializable {
private Long view;
private Long id;
private String name;
private String bizName;
private Long useCnt;
private SchemaElementType type;
private List<String> alias;
private List<SchemaValueMap> schemaValueMaps;
private List<RelatedSchemaElement> relatedSchemaElements;
private String defaultAgg;
private double order;
@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
SchemaElement schemaElement = (SchemaElement) o;
return Objects.equal(view, schemaElement.view) && Objects.equal(id,
schemaElement.id) && Objects.equal(name, schemaElement.name)
&& Objects.equal(bizName, schemaElement.bizName)
&& Objects.equal(type, schemaElement.type);
}
@Override
public int hashCode() {
return Objects.hashCode(view, id, name, bizName, type);
}
public List<String> getModelNames() {
return Lists.newArrayList(name);
}
}
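
A note on the equality contract above: equals and hashCode key only on (view, id, name, bizName, type), so elements that differ in other fields collapse when deduplicated. A minimal sketch (not part of this commit; values invented):

Set<SchemaElement> unique = new HashSet<>();
unique.add(SchemaElement.builder().view(1L).id(2L).name("pv")
.bizName("page_view").type(SchemaElementType.METRIC).useCnt(10L).build());
unique.add(SchemaElement.builder().view(1L).id(2L).name("pv")
.bizName("page_view").type(SchemaElementType.METRIC).useCnt(99L).build());
// unique.size() == 1: the two elements differ only in useCnt, which equality ignores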

View File

@@ -1,5 +1,6 @@
package com.tencent.supersonic.chat.api.pojo;
import com.tencent.supersonic.headless.api.pojo.SchemaElement;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;

View File

@@ -1,12 +0,0 @@
package com.tencent.supersonic.chat.api.pojo;
public enum SchemaElementType {
VIEW,
METRIC,
DIMENSION,
VALUE,
ENTITY,
TAG,
ID,
DATE
}

View File

@@ -1,24 +0,0 @@
package com.tencent.supersonic.chat.api.pojo;
import java.util.ArrayList;
import java.util.List;
import lombok.Data;
@Data
public class SchemaValueMap {
/**
 * dimension value as stored in the database
 */
private String techName;
/**
 * dimension value as displayed in query results
 */
private String bizName;
/**
 * aliases users may type for this value when querying
 */
private List<String> alias = new ArrayList<>();
}
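
How the three fields relate when mapping a dimension value, as a small sketch (not from this commit; values invented):

SchemaValueMap vipValue = new SchemaValueMap();
vipValue.setTechName("1"); // raw value as stored in the database
vipValue.setBizName("VIP"); // value rendered in query results
vipValue.setAlias(Arrays.asList("vip user", "member")); // phrasings a user might type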

View File

@@ -9,6 +9,7 @@ import com.tencent.supersonic.common.pojo.Order;
import com.tencent.supersonic.common.pojo.enums.QueryType;
import com.tencent.supersonic.common.pojo.enums.AggregateTypeEnum;
import com.tencent.supersonic.common.pojo.enums.FilterType;
import com.tencent.supersonic.headless.api.pojo.SchemaElement;
import lombok.Data;
import java.util.ArrayList;

View File

@@ -1,5 +1,7 @@
package com.tencent.supersonic.chat.api.pojo;
import com.tencent.supersonic.headless.api.pojo.SchemaElement;
import com.tencent.supersonic.headless.api.pojo.SchemaElementType;
import org.springframework.util.CollectionUtils;
import java.io.Serializable;

View File

@@ -1,6 +1,8 @@
package com.tencent.supersonic.chat.api.pojo;
import com.tencent.supersonic.headless.api.pojo.QueryConfig;
import com.tencent.supersonic.headless.api.pojo.SchemaElement;
import com.tencent.supersonic.headless.api.pojo.SchemaElementType;
import com.tencent.supersonic.headless.api.pojo.TagTypeDefaultConfig;
import com.tencent.supersonic.headless.api.pojo.TimeDefaultConfig;
import lombok.Data;

View File

@@ -1,23 +0,0 @@
package com.tencent.supersonic.chat.api.pojo.request;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.ToString;
import javax.validation.constraints.NotNull;
import java.util.List;
import static java.time.LocalDate.now;
@ToString
@Data
@NoArgsConstructor
public class DictLatestTaskReq {
@NotNull
private Long modelId;
private List<Long> dimIds;
private String createdAt = now().minusDays(4).toString();
}

View File

@@ -1,20 +0,0 @@
package com.tencent.supersonic.chat.api.pojo.request;
import com.tencent.supersonic.common.pojo.enums.TaskStatusEnum;
import lombok.Data;
import lombok.ToString;
@ToString
@Data
public class DictTaskFilterReq {
private Long id;
private String name;
private String createdBy;
private String createdAt;
private TaskStatusEnum status;
}

View File

@@ -1,20 +0,0 @@
package com.tencent.supersonic.chat.api.pojo.request;
import javax.validation.constraints.NotNull;
import lombok.Data;
@Data
public class DimensionValueReq {
private Integer agentId;
@NotNull
private Long elementID;
private Long modelId;
private String bizName;
@NotNull
private String value;
}

View File

@@ -2,7 +2,7 @@ package com.tencent.supersonic.chat.api.pojo.request;
import com.tencent.supersonic.auth.api.authentication.pojo.User;
import com.tencent.supersonic.chat.api.pojo.SchemaElement;
import com.tencent.supersonic.headless.api.pojo.SchemaElement;
import com.tencent.supersonic.common.pojo.DateConf;
import java.util.HashSet;
import java.util.Set;

View File

@@ -1,7 +1,7 @@
package com.tencent.supersonic.chat.api.pojo.response;
import com.tencent.supersonic.chat.api.pojo.SchemaElement;
import com.tencent.supersonic.headless.api.pojo.SchemaElement;
import com.tencent.supersonic.common.pojo.Constants;
import com.tencent.supersonic.common.pojo.enums.TimeMode;
import lombok.Data;

View File

@@ -1,6 +1,6 @@
package com.tencent.supersonic.chat.api.pojo.response;
import com.tencent.supersonic.chat.api.pojo.SchemaElement;
import com.tencent.supersonic.headless.api.pojo.SchemaElement;
import java.util.List;
import lombok.Data;

View File

@@ -1,6 +1,6 @@
package com.tencent.supersonic.chat.api.pojo.response;
import com.tencent.supersonic.chat.api.pojo.SchemaElement;
import com.tencent.supersonic.headless.api.pojo.SchemaElement;
import com.tencent.supersonic.chat.api.pojo.SemanticParseInfo;
import com.tencent.supersonic.common.pojo.QueryAuthorization;
import com.tencent.supersonic.common.pojo.QueryColumn;

View File

@@ -1,6 +1,6 @@
package com.tencent.supersonic.chat.api.pojo.response;
import com.tencent.supersonic.chat.api.pojo.SchemaElement;
import com.tencent.supersonic.headless.api.pojo.SchemaElement;
import lombok.Data;
import java.util.List;

View File

@@ -1,6 +1,6 @@
package com.tencent.supersonic.chat.api.pojo.response;
import com.tencent.supersonic.chat.api.pojo.SchemaElementType;
import com.tencent.supersonic.headless.api.pojo.SchemaElementType;
import java.util.Objects;
import lombok.Builder;
import lombok.Data;

View File

@@ -21,70 +21,6 @@
<groupId>org.springframework</groupId>
<artifactId>spring-context</artifactId>
</dependency>
<dependency>
<groupId>com.hankcs</groupId>
<artifactId>hanlp</artifactId>
<version>${hanlp.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>${hadoop.version}</version>
<exclusions>
<exclusion>
<groupId>org.apache.zookeeper</groupId>
<artifactId>zookeeper</artifactId>
</exclusion>
<exclusion>
<groupId>javax.servlet</groupId>
<artifactId>servlet-api</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>${hadoop.version}</version>
<exclusions>
<exclusion>
<groupId>org.apache.zookeeper</groupId>
<artifactId>zookeeper</artifactId>
</exclusion>
<exclusion>
<groupId>javax.servlet</groupId>
<artifactId>servlet-api</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>${hadoop.version}</version>
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
<exclusion>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.zookeeper</groupId>
<artifactId>zookeeper</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.curator</groupId>
<artifactId>*</artifactId>
</exclusion>
<exclusion>
<groupId>javax.servlet</groupId>
<artifactId>servlet-api</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.testng</groupId>
<artifactId>testng</artifactId>

View File

@@ -1,334 +0,0 @@
package com.hankcs.hanlp.collection.trie.bintrie;
import com.hankcs.hanlp.corpus.io.ByteArray;
import com.tencent.supersonic.chat.core.knowledge.LoadRemoveService;
import com.tencent.supersonic.common.util.ContextUtils;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;
import java.util.AbstractMap;
import java.util.ArrayDeque;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Queue;
import java.util.Set;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public abstract class BaseNode<V> implements Comparable<BaseNode> {
/**
 * Status array, cached so statuses can be read quickly during loading
 */
static final Status[] ARRAY_STATUS = Status.values();
private static final Logger logger = LoggerFactory.getLogger(BaseNode.class);
/**
 * child nodes
 */
protected BaseNode[] child;
/**
 * node status
 */
protected Status status;
/**
 * the character this node represents
 */
protected char c;
/**
 * the value this node holds
 */
protected V value;
protected String prefix = null;
public BaseNode<V> transition(String path, int begin) {
BaseNode<V> cur = this;
for (int i = begin; i < path.length(); ++i) {
cur = cur.getChild(path.charAt(i));
if (cur == null || cur.status == Status.UNDEFINED_0) {
return null;
}
}
return cur;
}
public BaseNode<V> transition(char[] path, int begin) {
BaseNode<V> cur = this;
for (int i = begin; i < path.length; ++i) {
cur = cur.getChild(path[i]);
if (cur == null || cur.status == Status.UNDEFINED_0) {
return null;
}
}
return cur;
}
/**
 * Transition by a single character
 *
 * @param path the character to transition on
 * @return the target node, or null if no transition exists
 */
public BaseNode<V> transition(char path) {
BaseNode<V> cur = this;
cur = cur.getChild(path);
if (cur == null || cur.status == Status.UNDEFINED_0) {
return null;
}
return cur;
}
/**
 * Add a child node
 *
 * @return true if a new node was added, false if an existing node was modified
 */
protected abstract boolean addChild(BaseNode node);
/**
 * Whether a child exists for the given character
 *
 * @param c the child's char
 * @return whether such a child exists
 */
protected boolean hasChild(char c) {
return getChild(c) != null;
}
protected char getChar() {
return c;
}
/**
 * Get the child node for a character
 *
 * @param c the child's char
 * @return the child node
 */
public abstract BaseNode getChild(char c);
/**
 * Get this node's value
 *
 * @return the value
 */
public final V getValue() {
return value;
}
/**
 * Set this node's value
 *
 * @param value the value
 */
public final void setValue(V value) {
this.value = value;
}
@Override
public int compareTo(BaseNode other) {
return compareTo(other.getChar());
}
/**
 * Overload: compare against a character
 *
 * @param other the character to compare with
 * @return the comparison result
 */
public int compareTo(char other) {
if (this.c > other) {
return 1;
}
if (this.c < other) {
return -1;
}
return 0;
}
/**
 * Get the node's word-formation status
 *
 * @return the status
 */
public Status getStatus() {
return status;
}
protected void walk(StringBuilder sb, Set<Map.Entry<String, V>> entrySet) {
sb.append(c);
if (status == Status.WORD_MIDDLE_2 || status == Status.WORD_END_3) {
entrySet.add(new TrieEntry(sb.toString(), value));
}
if (child == null) {
return;
}
for (BaseNode node : child) {
if (node == null) {
continue;
}
node.walk(new StringBuilder(sb.toString()), entrySet);
}
}
protected void walkToSave(DataOutputStream out) throws IOException {
out.writeChar(c);
out.writeInt(status.ordinal());
int childSize = 0;
if (child != null) {
childSize = child.length;
}
out.writeInt(childSize);
if (child == null) {
return;
}
for (BaseNode node : child) {
node.walkToSave(out);
}
}
protected void walkToSave(ObjectOutput out) throws IOException {
out.writeChar(c);
out.writeInt(status.ordinal());
if (status == Status.WORD_END_3 || status == Status.WORD_MIDDLE_2) {
out.writeObject(value);
}
int childSize = 0;
if (child != null) {
childSize = child.length;
}
out.writeInt(childSize);
if (child == null) {
return;
}
for (BaseNode node : child) {
node.walkToSave(out);
}
}
protected void walkToLoad(ByteArray byteArray, _ValueArray<V> valueArray) {
c = byteArray.nextChar();
status = ARRAY_STATUS[byteArray.nextInt()];
if (status == Status.WORD_END_3 || status == Status.WORD_MIDDLE_2) {
value = valueArray.nextValue();
}
int childSize = byteArray.nextInt();
child = new BaseNode[childSize];
for (int i = 0; i < childSize; ++i) {
child[i] = new Node<V>();
child[i].walkToLoad(byteArray, valueArray);
}
}
protected void walkToLoad(ObjectInput byteArray) throws IOException, ClassNotFoundException {
c = byteArray.readChar();
status = ARRAY_STATUS[byteArray.readInt()];
if (status == Status.WORD_END_3 || status == Status.WORD_MIDDLE_2) {
value = (V) byteArray.readObject();
}
int childSize = byteArray.readInt();
child = new BaseNode[childSize];
for (int i = 0; i < childSize; ++i) {
child[i] = new Node<V>();
child[i].walkToLoad(byteArray);
}
}
public enum Status {
/**
 * Unspecified; used when deleting entries
 */
UNDEFINED_0,
/**
 * Not the end of a word
 */
NOT_WORD_1,
/**
 * End of a word, and the path can continue
 */
WORD_MIDDLE_2,
/**
 * End of a word, with no continuation
 */
WORD_END_3,
}
public class TrieEntry extends AbstractMap.SimpleEntry<String, V> implements Comparable<TrieEntry> {
public TrieEntry(String key, V value) {
super(key, value);
}
@Override
public int compareTo(TrieEntry o) {
return getKey().compareTo(String.valueOf(o.getKey()));
}
}
@Override
public String toString() {
return "BaseNode{"
+ "child="
+ Arrays.toString(child)
+ ", status="
+ status
+ ", c="
+ c
+ ", value="
+ value
+ ", prefix='"
+ prefix
+ '\''
+ '}';
}
public void walkNode(Set<Map.Entry<String, V>> entrySet, Integer agentId, Set<Long> detectModelIds) {
if (status == Status.WORD_MIDDLE_2 || status == Status.WORD_END_3) {
LoadRemoveService loadRemoveService = ContextUtils.getBean(LoadRemoveService.class);
logger.debug("agentId:{},detectModelIds:{},before:{}", agentId, detectModelIds, value.toString());
List natures = loadRemoveService.removeNatures((List) value, agentId, detectModelIds);
String name = this.prefix != null ? this.prefix + c : "" + c;
logger.debug("name:{},after:{},natures:{}", name, (List) value, natures);
entrySet.add(new TrieEntry(name, (V) natures));
}
}
/***
 * Breadth-first walk that stops once the entry set reaches the limit
 * @param sb prefix accumulated so far
 * @param entrySet collector for matched entries
 * @param limit maximum number of entries to collect
 */
public void walkLimit(StringBuilder sb, Set<Map.Entry<String, V>> entrySet, int limit, Integer agentId,
Set<Long> detectModelIds) {
Queue<BaseNode> queue = new ArrayDeque<>();
this.prefix = sb.toString();
queue.add(this);
while (!queue.isEmpty()) {
if (entrySet.size() >= limit) {
break;
}
BaseNode root = queue.poll();
if (root == null) {
continue;
}
root.walkNode(entrySet, agentId, detectModelIds);
if (root.child == null) {
continue;
}
String prefix = root.prefix + root.c;
for (BaseNode node : root.child) {
if (Objects.nonNull(node)) {
node.prefix = prefix;
queue.add(node);
}
}
}
}
}

View File

@@ -1,393 +0,0 @@
package com.hankcs.hanlp.dictionary;
import static com.hankcs.hanlp.utility.Predefine.logger;
import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.collection.trie.DoubleArrayTrie;
import com.hankcs.hanlp.corpus.io.ByteArray;
import com.hankcs.hanlp.corpus.io.IOUtil;
import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.utility.Predefine;
import com.hankcs.hanlp.utility.TextUtility;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.DataOutputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.util.Collection;
import java.util.TreeMap;
/**
 * Core dictionary backed by a DoubleArrayTrie
 */
public class CoreDictionary {
public static DoubleArrayTrie<Attribute> trie = new DoubleArrayTrie<Attribute>();
public static final String PATH = HanLP.Config.CoreDictionaryPath;
// the dictionary is loaded automatically at class initialization
static {
long start = System.currentTimeMillis();
if (!load(PATH)) {
throw new IllegalArgumentException("Failed to load core dictionary " + PATH);
} else {
logger.info(PATH + " loaded successfully: " + trie.size() + " entries in " + (System.currentTimeMillis() - start) + "ms");
}
}
// a few special WORD_IDs
public static final int NR_WORD_ID = getWordID(Predefine.TAG_PEOPLE);
public static final int NS_WORD_ID = getWordID(Predefine.TAG_PLACE);
public static final int NT_WORD_ID = getWordID(Predefine.TAG_GROUP);
public static final int T_WORD_ID = getWordID(Predefine.TAG_TIME);
public static final int X_WORD_ID = getWordID(Predefine.TAG_CLUSTER);
public static final int M_WORD_ID = getWordID(Predefine.TAG_NUMBER);
public static final int NX_WORD_ID = getWordID(Predefine.TAG_PROPER);
private static boolean load(String path) {
logger.info("核心词典开始加载:" + path);
if (loadDat(path)) {
return true;
}
TreeMap<String, Attribute> map = new TreeMap<String, Attribute>();
BufferedReader br = null;
try {
br = new BufferedReader(new InputStreamReader(IOUtil.newInputStream(path), "UTF-8"));
String line;
int totalFrequency = 0;
long start = System.currentTimeMillis();
while ((line = br.readLine()) != null) {
String[] param = line.split("\\s");
int natureCount = (param.length - 1) / 2;
Attribute attribute = new Attribute(natureCount);
for (int i = 0; i < natureCount; ++i) {
attribute.nature[i] = Nature.create(param[1 + 2 * i]);
attribute.frequency[i] = Integer.parseInt(param[2 + 2 * i]);
attribute.totalFrequency += attribute.frequency[i];
}
map.put(param[0], attribute);
totalFrequency += attribute.totalFrequency;
}
logger.info(
"Core dictionary read " + map.size() + " entries, total frequency " + totalFrequency + ", in " + (System.currentTimeMillis() - start)
+ "ms");
br.close();
trie.build(map);
logger.info("Core dictionary loaded successfully: " + trie.size() + " entries; writing cache next...");
try {
DataOutputStream out = new DataOutputStream(
new BufferedOutputStream(IOUtil.newOutputStream(path + Predefine.BIN_EXT)));
Collection<Attribute> attributeList = map.values();
out.writeInt(attributeList.size());
for (Attribute attribute : attributeList) {
out.writeInt(attribute.totalFrequency);
out.writeInt(attribute.nature.length);
for (int i = 0; i < attribute.nature.length; ++i) {
out.writeInt(attribute.nature[i].ordinal());
out.writeInt(attribute.frequency[i]);
}
}
trie.save(out);
out.writeInt(totalFrequency);
Predefine.setTotalFrequency(totalFrequency);
out.close();
} catch (Exception e) {
logger.warning("保存失败" + e);
return false;
}
} catch (FileNotFoundException e) {
logger.warning("Core dictionary " + path + " does not exist! " + e);
return false;
} catch (IOException e) {
logger.warning("Core dictionary " + path + " read error! " + e);
return false;
}
return true;
}
/**
 * Load the double-array trie from disk
 *
 * @param path dictionary path (without the .bin extension)
 * @return whether loading succeeded
 */
static boolean loadDat(String path) {
try {
ByteArray byteArray = ByteArray.createByteArray(path + Predefine.BIN_EXT);
if (byteArray == null) {
return false;
}
int size = byteArray.nextInt();
Attribute[] attributes = new Attribute[size];
final Nature[] natureIndexArray = Nature.values();
for (int i = 0; i < size; ++i) {
// the first int is the total frequency, the second is the number of natures
int currentTotalFrequency = byteArray.nextInt();
int length = byteArray.nextInt();
attributes[i] = new Attribute(length);
attributes[i].totalFrequency = currentTotalFrequency;
for (int j = 0; j < length; ++j) {
attributes[i].nature[j] = natureIndexArray[byteArray.nextInt()];
attributes[i].frequency[j] = byteArray.nextInt();
}
}
if (!trie.load(byteArray, attributes)) {
return false;
}
int totalFrequency = 0;
if (byteArray.hasMore()) {
totalFrequency = byteArray.nextInt();
} else {
for (Attribute attribute : attributes) {
totalFrequency += attribute.totalFrequency;
}
}
Predefine.setTotalFrequency(totalFrequency);
} catch (Exception e) {
logger.warning("读取失败,问题发生在" + e);
return false;
}
return true;
}
/**
 * Get an entry
 *
 * @param key the word
 * @return its attribute, or null if absent
 */
public static Attribute get(String key) {
return trie.get(key);
}
/**
 * Get an entry
 *
 * @param wordID the word ID
 * @return its attribute
 */
public static Attribute get(int wordID) {
return trie.get(wordID);
}
/**
 * Get a term's total frequency
 *
 * @param term the word
 * @return its total frequency, or 0 if absent
 */
public static int getTermFrequency(String term) {
Attribute attribute = get(term);
if (attribute == null) {
return 0;
}
return attribute.totalFrequency;
}
/**
 * Whether the dictionary contains the word
 *
 * @param key the word
 * @return whether it is present
 */
public static boolean contains(String key) {
return trie.get(key) != null;
}
/**
 * Attributes of a word in the core dictionary
 */
public static class Attribute implements Serializable {
/**
 * list of natures (POS tags)
 */
public Nature[] nature;
/**
 * frequency of each nature
 */
public int[] frequency;
public int totalFrequency;
public String original = null;
public Attribute(int size) {
nature = new Nature[size];
frequency = new int[size];
}
public Attribute(Nature[] nature, int[] frequency) {
this.nature = nature;
this.frequency = frequency;
}
public Attribute(Nature nature, int frequency) {
this(1);
this.nature[0] = nature;
this.frequency[0] = frequency;
totalFrequency = frequency;
}
public Attribute(Nature[] nature, int[] frequency, int totalFrequency) {
this.nature = nature;
this.frequency = frequency;
this.totalFrequency = totalFrequency;
}
/**
 * Construct with a single nature and a default frequency of 1000
 *
 * @param nature the nature
 */
public Attribute(Nature nature) {
this(nature, 1000);
}
public static Attribute create(String natureWithFrequency) {
try {
String[] param = natureWithFrequency.split(" ");
if (param.length % 2 != 0) {
return new Attribute(Nature.create(natureWithFrequency.trim()), 1); // safety fallback: treat the whole string as one nature with frequency 1
}
int natureCount = param.length / 2;
Attribute attribute = new Attribute(natureCount);
for (int i = 0; i < natureCount; ++i) {
attribute.nature[i] = Nature.create(param[2 * i]);
attribute.frequency[i] = Integer.parseInt(param[1 + 2 * i]);
attribute.totalFrequency += attribute.frequency[i];
}
return attribute;
} catch (Exception e) {
logger.warning("使用字符串" + natureWithFrequency + "创建词条属性失败!" + TextUtility.exceptionToString(e));
return null;
}
}
/**
 * Load from a byte stream
 *
 * @param byteArray source bytes
 * @param natureIndexArray index of natures
 * @return the loaded attribute
 */
public static Attribute create(ByteArray byteArray, Nature[] natureIndexArray) {
int currentTotalFrequency = byteArray.nextInt();
int length = byteArray.nextInt();
Attribute attribute = new Attribute(length);
attribute.totalFrequency = currentTotalFrequency;
for (int j = 0; j < length; ++j) {
attribute.nature[j] = natureIndexArray[byteArray.nextInt()];
attribute.frequency[j] = byteArray.nextInt();
}
return attribute;
}
/**
 * Get the frequency of a nature
 *
 * @param nature the nature as a string
 * @return its frequency
 * @deprecated prefer the Nature-typed overload
 */
public int getNatureFrequency(String nature) {
try {
Nature pos = Nature.create(nature);
return getNatureFrequency(pos);
} catch (IllegalArgumentException e) {
return 0;
}
}
/**
 * Get the frequency of a nature
 *
 * @param nature the nature
 * @return its frequency
 */
public int getNatureFrequency(final Nature nature) {
int i = 0;
for (Nature pos : this.nature) {
if (nature == pos) {
return frequency[i];
}
++i;
}
return 0;
}
/**
 * Whether the word has the given nature
 *
 * @param nature the nature
 * @return whether it is present
 */
public boolean hasNature(Nature nature) {
return getNatureFrequency(nature) > 0;
}
/**
 * Whether any nature starts with the given prefix
 *
 * @param prefix a nature prefix, e.g. "u" matches ude, uzhe, etc.
 * @return whether such a nature exists
 */
public boolean hasNatureStartsWith(String prefix) {
for (Nature n : nature) {
if (n.startsWith(prefix)) {
return true;
}
}
return false;
}
@Override
public String toString() {
final StringBuilder sb = new StringBuilder();
for (int i = 0; i < nature.length; ++i) {
sb.append(nature[i]).append(' ').append(frequency[i]).append(' ');
}
return sb.toString();
}
public void save(DataOutputStream out) throws IOException {
out.writeInt(totalFrequency);
out.writeInt(nature.length);
for (int i = 0; i < nature.length; ++i) {
out.writeInt(nature[i].ordinal());
out.writeInt(frequency[i]);
}
}
}
/**
 * Get a word's ID
 *
 * @param a the word
 * @return its ID, or -1 if absent
 */
public static int getWordID(String a) {
return CoreDictionary.trie.exactMatchSearch(a);
}
/**
 * Hot-reload the core dictionary.<br>
 * In a cluster, or with other IOAdapters, the cache file must be deleted manually
 *
 * @return whether the reload succeeded
 */
public static boolean reload() {
String path = CoreDictionary.PATH;
IOUtil.deleteFile(path + Predefine.BIN_EXT);
return load(path);
}
}
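
Given the static API above, lookups are one-liners; a quick sketch (the word is arbitrary):

int freq = CoreDictionary.getTermFrequency("商品"); // 0 if the word is absent
boolean known = CoreDictionary.contains("商品");
int wordId = CoreDictionary.getWordID("商品"); // -1 if absent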

View File

@@ -1,341 +0,0 @@
package com.hankcs.hanlp.seg;
import com.hankcs.hanlp.algorithm.Viterbi;
import com.hankcs.hanlp.collection.AhoCorasick.AhoCorasickDoubleArrayTrie;
import com.hankcs.hanlp.collection.trie.DoubleArrayTrie;
import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.dictionary.CoreDictionary;
import com.hankcs.hanlp.dictionary.CoreDictionaryTransformMatrixDictionary;
import com.hankcs.hanlp.dictionary.other.CharType;
import com.hankcs.hanlp.seg.NShort.Path.AtomNode;
import com.hankcs.hanlp.seg.common.Graph;
import com.hankcs.hanlp.seg.common.Term;
import com.hankcs.hanlp.seg.common.Vertex;
import com.hankcs.hanlp.seg.common.WordNet;
import com.hankcs.hanlp.utility.TextUtility;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.ListIterator;
public abstract class WordBasedSegment extends Segment {
public WordBasedSegment() {
}
protected static void generateWord(List<Vertex> linkedArray, WordNet wordNetOptimum) {
fixResultByRule(linkedArray);
wordNetOptimum.addAll(linkedArray);
}
protected static void fixResultByRule(List<Vertex> linkedArray) {
mergeContinueNumIntoOne(linkedArray);
changeDelimiterPOS(linkedArray);
splitMiddleSlashFromDigitalWords(linkedArray);
checkDateElements(linkedArray);
}
static void changeDelimiterPOS(List<Vertex> linkedArray) {
// NOTE: two non-ASCII delimiter literals were garbled in this diff; the
// full-width dash "-" and em dash "—" are assumed below.
for (Vertex vertex : linkedArray) {
if (vertex.realWord.equals("-") || vertex.realWord.equals("—") || vertex.realWord.equals("-")) {
vertex.confirmNature(Nature.w);
}
}
}
private static void splitMiddleSlashFromDigitalWords(List<Vertex> linkedArray) {
if (linkedArray.size() >= 2) {
ListIterator<Vertex> listIterator = linkedArray.listIterator();
Vertex next = (Vertex) listIterator.next();
for (Vertex current = next; listIterator.hasNext(); current = next) {
next = (Vertex) listIterator.next();
Nature currentNature = current.getNature();
if (currentNature == Nature.nx && (next.hasNature(Nature.q) || next.hasNature(Nature.n))) {
String[] param = current.realWord.split("-", 2); // assumed fix: the original limit of 1 could never yield two parts, leaving the check below dead
if (param.length == 2 && TextUtility.isAllNum(param[0]) && TextUtility.isAllNum(param[1])) {
current = current.copy();
current.realWord = param[0];
current.confirmNature(Nature.m);
listIterator.previous();
listIterator.previous();
listIterator.set(current);
listIterator.next();
listIterator.add(Vertex.newPunctuationInstance("-"));
listIterator.add(Vertex.newNumberInstance(param[1]));
}
}
}
}
}
private static void checkDateElements(List<Vertex> linkedArray) {
if (linkedArray.size() >= 2) {
ListIterator<Vertex> listIterator = linkedArray.listIterator();
Vertex next = (Vertex) listIterator.next();
for (Vertex current = next; listIterator.hasNext(); current = next) {
next = (Vertex) listIterator.next();
if (TextUtility.isAllNum(current.realWord) || TextUtility.isAllChineseNum(current.realWord)) {
String nextWord = next.realWord;
if (nextWord.length() == 1 && "月日时分秒".contains(nextWord)
|| nextWord.length() == 2 && nextWord.equals("月份")) {
mergeDate(listIterator, next, current);
} else if (nextWord.equals("年")) { // garbled literal; "年" (year) assumed from the isYearTime check below
if (TextUtility.isYearTime(current.realWord)) {
mergeDate(listIterator, next, current);
} else {
current.confirmNature(Nature.m);
}
} else if (current.realWord.endsWith("点")) { // garbled literal; "点" (o'clock) assumed, matching the time nature below
current.confirmNature(Nature.t, true);
} else {
char[] tmpCharArray = current.realWord.toCharArray();
String lastChar = String.valueOf(tmpCharArray[tmpCharArray.length - 1]);
if (!"∶·././".contains(lastChar)) {
current.confirmNature(Nature.m, true);
} else if (current.realWord.length() > 1) {
char last = current.realWord.charAt(current.realWord.length() - 1);
current = Vertex.newNumberInstance(
current.realWord.substring(0, current.realWord.length() - 1));
listIterator.previous();
listIterator.previous();
listIterator.set(current);
listIterator.next();
listIterator.add(Vertex.newPunctuationInstance(String.valueOf(last)));
}
}
}
}
}
}
private static void mergeDate(ListIterator<Vertex> listIterator, Vertex next, Vertex current) {
current = Vertex.newTimeInstance(current.realWord + next.realWord);
listIterator.previous();
listIterator.previous();
listIterator.set(current);
listIterator.next();
listIterator.next();
listIterator.remove();
}
protected static List<Term> convert(List<Vertex> vertexList) {
return convert(vertexList, false);
}
protected static Graph generateBiGraph(WordNet wordNet) {
return wordNet.toGraph();
}
/**
* @deprecated
*/
private static List<AtomNode> atomSegment(String sSentence, int start, int end) {
if (end < start) {
throw new RuntimeException("start=" + start + " < end=" + end);
} else {
List<AtomNode> atomSegment = new ArrayList();
int pCur = 0;
StringBuilder sb = new StringBuilder();
char[] charArray = sSentence.substring(start, end).toCharArray();
int[] charTypeArray = new int[charArray.length];
for (int i = 0; i < charArray.length; ++i) {
char c = charArray[i];
charTypeArray[i] = CharType.get(c);
if (c == '.' && i < charArray.length - 1 && CharType.get(charArray[i + 1]) == 9) {
charTypeArray[i] = 9;
} else if (c == '.' && i < charArray.length - 1 && charArray[i + 1] >= '0' && charArray[i + 1] <= '9') {
charTypeArray[i] = 5;
} else if (charTypeArray[i] == 8) {
charTypeArray[i] = 5;
}
}
while (pCur < charArray.length) {
int nCurType = charTypeArray[pCur];
if (nCurType != 7 && nCurType != 10 && nCurType != 6 && nCurType != 17) {
if (pCur < charArray.length - 1 && (nCurType == 5 || nCurType == 9)) {
sb.delete(0, sb.length());
sb.append(charArray[pCur]);
boolean reachEnd = true;
while (pCur < charArray.length - 1) {
++pCur;
int nNextType = charTypeArray[pCur];
if (nNextType != nCurType) {
reachEnd = false;
break;
}
sb.append(charArray[pCur]);
}
atomSegment.add(new AtomNode(sb.toString(), nCurType));
if (reachEnd) {
++pCur;
}
} else {
atomSegment.add(new AtomNode(charArray[pCur], nCurType));
++pCur;
}
} else {
String single = String.valueOf(charArray[pCur]);
if (single.length() != 0) {
atomSegment.add(new AtomNode(single, nCurType));
}
++pCur;
}
}
return atomSegment;
}
}
private static void mergeContinueNumIntoOne(List<Vertex> linkedArray) {
if (linkedArray.size() >= 2) {
ListIterator<Vertex> listIterator = linkedArray.listIterator();
Vertex next = (Vertex) listIterator.next();
Vertex current = next;
while (listIterator.hasNext()) {
next = (Vertex) listIterator.next();
if (!TextUtility.isAllNum(current.realWord) && !TextUtility.isAllChineseNum(current.realWord)
|| !TextUtility.isAllNum(next.realWord) && !TextUtility.isAllChineseNum(next.realWord)) {
current = next;
} else {
current = Vertex.newNumberInstance(current.realWord + next.realWord);
listIterator.previous();
listIterator.previous();
listIterator.set(current);
listIterator.next();
listIterator.next();
listIterator.remove();
}
}
}
}
protected void generateWordNet(final WordNet wordNetStorage) {
final char[] charArray = wordNetStorage.charArray;
DoubleArrayTrie.Searcher searcher = CoreDictionary.trie.getSearcher(charArray, 0);
while (searcher.next()) {
wordNetStorage.add(searcher.begin + 1, new Vertex(new String(charArray, searcher.begin, searcher.length),
(CoreDictionary.Attribute) searcher.value, searcher.index));
}
if (this.config.forceCustomDictionary) {
this.customDictionary.parseText(charArray, new AhoCorasickDoubleArrayTrie.IHit<CoreDictionary.Attribute>() {
public void hit(int begin, int end, CoreDictionary.Attribute value) {
wordNetStorage.add(begin + 1, new Vertex(new String(charArray, begin, end - begin), value));
}
});
}
LinkedList<Vertex>[] vertexes = wordNetStorage.getVertexes();
int i = 1;
while (i < vertexes.length) {
if (vertexes[i].isEmpty()) {
int j;
for (j = i + 1;
j < vertexes.length - 1 && (vertexes[j].isEmpty() || CharType.get(charArray[j - 1]) == 11);
++j) {
}
wordNetStorage.add(i, quickAtomSegment(charArray, i - 1, j - 1));
i = j;
} else {
i += ((Vertex) vertexes[i].getLast()).realWord.length();
}
}
}
protected List<Term> decorateResultForIndexMode(List<Vertex> vertexList, WordNet wordNetAll) {
List<Term> termList = new LinkedList();
int line = 1;
ListIterator<Vertex> listIterator = vertexList.listIterator();
listIterator.next();
int length = vertexList.size() - 2;
for (int i = 0; i < length; ++i) {
Vertex vertex = (Vertex) listIterator.next();
Term termMain = convert(vertex);
//termList.add(termMain);
addTerms(termList, vertex, line - 1);
termMain.offset = line - 1;
if (vertex.realWord.length() > 2) {
label43:
for (int currentLine = line; currentLine < line + vertex.realWord.length(); ++currentLine) {
Iterator iterator = wordNetAll.descendingIterator(currentLine);
while (true) {
Vertex smallVertex;
do {
if (!iterator.hasNext()) {
continue label43;
}
smallVertex = (Vertex) iterator.next();
} while ((termMain.nature != Nature.mq || !smallVertex.hasNature(Nature.q))
&& smallVertex.realWord.length() < this.config.indexMode);
if (smallVertex != vertex
&& currentLine + smallVertex.realWord.length() <= line + vertex.realWord.length()) {
listIterator.add(smallVertex);
//Term termSub = convert(smallVertex);
//termSub.offset = currentLine - 1;
//termList.add(termSub);
addTerms(termList, smallVertex, currentLine - 1);
}
}
}
}
line += vertex.realWord.length();
}
return termList;
}
protected static void speechTagging(List<Vertex> vertexList) {
Viterbi.compute(vertexList, CoreDictionaryTransformMatrixDictionary.transformMatrixDictionary);
}
protected void addTerms(List<Term> terms, Vertex vertex, int offset) {
for (int i = 0; i < vertex.attribute.nature.length; i++) {
Term term = new Term(vertex.realWord, vertex.attribute.nature[i]);
term.setFrequency(vertex.attribute.frequency[i]);
term.offset = offset;
terms.add(term);
}
}
}

View File

@@ -1,69 +0,0 @@
package com.hankcs.hanlp.seg.common;
import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.dictionary.CoreDictionary;
import com.hankcs.hanlp.dictionary.CustomDictionary;
import com.tencent.supersonic.chat.core.utils.HanlpHelper;
import lombok.Data;
import lombok.ToString;
@Data
@ToString
public class Term {
public String word;
public Nature nature;
public int offset;
public int frequency = 0;
public Term(String word, Nature nature) {
this.word = word;
this.nature = nature;
}
public Term(String word, Nature nature, int offset) {
this.word = word;
this.nature = nature;
this.offset = offset;
}
public Term(String word, Nature nature, int offset, int frequency) {
this.word = word;
this.nature = nature;
this.offset = offset;
this.frequency = frequency;
}
public int length() {
return this.word.length();
}
public int getFrequency() {
if (frequency > 0) {
return frequency;
}
String wordOri = word.toLowerCase();
CoreDictionary.Attribute attribute = HanlpHelper.getDynamicCustomDictionary().get(wordOri);
if (attribute == null) {
attribute = CoreDictionary.get(wordOri);
if (attribute == null) {
attribute = CustomDictionary.get(wordOri);
}
}
if (attribute != null && nature != null && attribute.hasNature(nature)) {
return attribute.getNatureFrequency(nature);
}
return attribute == null ? 0 : attribute.totalFrequency;
}
@Override
public boolean equals(Object obj) {
if (obj instanceof Term) {
Term term = (Term) obj;
if (this.nature == term.nature && this.word.equals(term.word)) {
return true;
}
}
return super.equals(obj);
}
}
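
The getFrequency fallback order above is easy to miss; a sketch (assuming dictionaries are loaded):

Term term = new Term("vip", Nature.nz);
int f = term.getFrequency();
// explicit frequency if set; otherwise the dynamic custom dictionary,
// then the core dictionary, then the custom dictionary; 0 if nothing matches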

View File

@@ -1,7 +1,9 @@
package com.tencent.supersonic.chat.core.config;
import com.tencent.supersonic.chat.core.utils.HanlpHelper;
import com.tencent.supersonic.headless.core.knowledge.helper.HanlpHelper;
import java.io.FileNotFoundException;
import lombok.Data;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Value;
@@ -11,7 +13,7 @@ import org.springframework.context.annotation.Configuration;
@Data
@Configuration
@Slf4j
public class LocalFileConfig {
public class ChatLocalFileConfig {
@Value("${dict.directory.latest:/data/dictionary/custom}")

View File

@@ -1,6 +1,6 @@
package com.tencent.supersonic.chat.core.corrector;
import com.tencent.supersonic.chat.api.pojo.SchemaElement;
import com.tencent.supersonic.headless.api.pojo.SchemaElement;
import com.tencent.supersonic.chat.api.pojo.SemanticParseInfo;
import com.tencent.supersonic.chat.api.pojo.SemanticSchema;
import com.tencent.supersonic.chat.core.pojo.QueryContext;

View File

@@ -1,7 +1,7 @@
package com.tencent.supersonic.chat.core.corrector;
import com.tencent.supersonic.chat.api.pojo.SchemaElement;
import com.tencent.supersonic.chat.api.pojo.SchemaValueMap;
import com.tencent.supersonic.headless.api.pojo.SchemaElement;
import com.tencent.supersonic.headless.api.pojo.SchemaValueMap;
import com.tencent.supersonic.chat.api.pojo.SemanticParseInfo;
import com.tencent.supersonic.chat.api.pojo.SemanticSchema;
import com.tencent.supersonic.chat.api.pojo.request.QueryFilters;

View File

@@ -1,30 +0,0 @@
package com.tencent.supersonic.chat.core.knowledge;
import com.google.common.base.Objects;
import com.tencent.supersonic.chat.api.pojo.SchemaElement;
import lombok.Data;
import lombok.ToString;
@Data
@ToString
public class DatabaseMapResult extends MapResult {
private SchemaElement schemaElement;
@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
DatabaseMapResult that = (DatabaseMapResult) o;
return Objects.equal(name, that.name) && Objects.equal(schemaElement, that.schemaElement);
}
@Override
public int hashCode() {
return Objects.hashCode(name, schemaElement);
}
}

View File

@@ -1,13 +0,0 @@
package com.tencent.supersonic.chat.core.knowledge;
import java.util.List;
import lombok.Data;
@Data
public class DictConfig {
private Long modelId;
private List<DimValueInfo> dimValueInfoList;
}

View File

@@ -1,31 +0,0 @@
package com.tencent.supersonic.chat.core.knowledge;
public enum DictUpdateMode {
OFFLINE_FULL("OFFLINE_FULL"),
OFFLINE_MODEL("OFFLINE_MODEL"),
REALTIME_ADD("REALTIME_ADD"),
REALTIME_DELETE("REALTIME_DELETE"),
NOT_SUPPORT("NOT_SUPPORT");
private String value;
DictUpdateMode(String value) {
this.value = value;
}
public static DictUpdateMode of(String value) {
for (DictUpdateMode item : DictUpdateMode.values()) {
if (item.value.equalsIgnoreCase(value)) {
return item;
}
}
return DictUpdateMode.NOT_SUPPORT;
}
public String getValue() {
return value;
}
}
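
A sketch of the lookup behavior (inputs invented):

DictUpdateMode full = DictUpdateMode.of("offline_full"); // case-insensitive match -> OFFLINE_FULL
DictUpdateMode other = DictUpdateMode.of("bogus"); // unknown values fall back to NOT_SUPPORT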

View File

@@ -1,34 +0,0 @@
package com.tencent.supersonic.chat.core.knowledge;
import java.util.Objects;
import lombok.Data;
import lombok.ToString;
/***
 * A word together with its nature (part-of-speech tag)
 */
@Data
@ToString
public class DictWord {
private String word;
private String nature;
private String natureWithFrequency;
@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
DictWord that = (DictWord) o;
return Objects.equals(word, that.word) && Objects.equals(natureWithFrequency, that.natureWithFrequency);
}
@Override
public int hashCode() {
return Objects.hash(word, natureWithFrequency);
}
}

View File

@@ -1,38 +0,0 @@
package com.tencent.supersonic.chat.core.knowledge.dictionary;
import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.dictionary.CoreDictionary;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
/**
* Dictionary Attribute Util
*/
public class DictionaryAttributeUtil {
public static CoreDictionary.Attribute getAttribute(CoreDictionary.Attribute old, CoreDictionary.Attribute add) {
Map<Nature, Integer> map = new HashMap<>();
IntStream.range(0, old.nature.length).boxed().forEach(i -> map.put(old.nature[i], old.frequency[i]));
IntStream.range(0, add.nature.length).boxed().forEach(i -> map.put(add.nature[i], add.frequency[i]));
List<Map.Entry<Nature, Integer>> list = new LinkedList<Map.Entry<Nature, Integer>>(map.entrySet());
Collections.sort(list, new Comparator<Map.Entry<Nature, Integer>>() {
public int compare(Map.Entry<Nature, Integer> o1, Map.Entry<Nature, Integer> o2) {
return o2.getValue() - o1.getValue();
}
});
CoreDictionary.Attribute attribute = new CoreDictionary.Attribute(
list.stream().map(i -> i.getKey()).collect(Collectors.toList()).toArray(new Nature[0]),
list.stream().map(i -> i.getValue()).mapToInt(Integer::intValue).toArray(),
list.stream().map(i -> i.getValue()).findFirst().get());
if (old.original != null || add.original != null) {
attribute.original = add.original != null ? add.original : old.original;
}
return attribute;
}
}
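
To make the merge semantics concrete: natures from both attributes are unioned, sorted by descending frequency, and totalFrequency becomes the single highest frequency rather than the sum. A sketch (frequencies invented):

CoreDictionary.Attribute old = new CoreDictionary.Attribute(Nature.n, 100);
CoreDictionary.Attribute add = new CoreDictionary.Attribute(Nature.nz, 500);
CoreDictionary.Attribute merged = DictionaryAttributeUtil.getAttribute(old, add);
// merged.nature == [nz, n], merged.frequency == [500, 100], merged.totalFrequency == 500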

View File

@@ -1,18 +0,0 @@
package com.tencent.supersonic.chat.core.knowledge;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import lombok.Data;
@Data
public class DimValue2DictCommand {
private DictUpdateMode updateMode;
private List<Long> modelIds;
private Map<Long, List<Long>> modelAndDimPair = new HashMap<>();
}

View File

@@ -1,31 +0,0 @@
package com.tencent.supersonic.chat.core.knowledge;
import com.tencent.supersonic.common.pojo.enums.TaskStatusEnum;
import java.util.Date;
import java.util.Set;
import lombok.Data;
@Data
public class DimValueDictInfo {
private Long id;
private String name;
private String description;
private String command;
private TaskStatusEnum status;
private String createdBy;
private Date createdAt;
private Long elapsedMs;
private Set<Long> dimIds;
}

View File

@@ -1,26 +0,0 @@
package com.tencent.supersonic.chat.core.knowledge;
import com.tencent.supersonic.common.pojo.enums.TypeEnums;
import java.util.List;
import javax.validation.constraints.NotNull;
public class DimValueInfo {
/**
 * metricId / dimensionId / domainId
 */
private Long itemId;
/**
 * type: IntentionTypeEnum;
 * currently only dimension-related values are supported
 */
@NotNull
private TypeEnums type = TypeEnums.DIMENSION;
private List<String> blackList;
private List<String> whiteList;
private List<String> ruleList;
private Boolean isDictInfo;
}

View File

@@ -1,34 +0,0 @@
package com.tencent.supersonic.chat.core.knowledge;
import com.google.common.base.Objects;
import java.util.Map;
import lombok.Data;
import lombok.ToString;
@Data
@ToString
public class EmbeddingResult extends MapResult {
private String id;
private double distance;
private Map<String, String> metadata;
@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
EmbeddingResult that = (EmbeddingResult) o;
return Objects.equal(id, that.id);
}
@Override
public int hashCode() {
return Objects.hashCode(id);
}
}

View File

@@ -1,56 +0,0 @@
package com.tencent.supersonic.chat.core.knowledge;
import java.util.List;
public interface FileHandler {
/**
 * backup a file to the backup directory
 * config: dict.directory.backup
 *
 * @param fileName name of the file to back up
 */
void backupFile(String fileName);
/**
 * create a directory
 *
 * @param path directory path to create
 */
void createDir(String path);
Boolean existPath(String path);
/**
 * write data to a specific file,
 * config dir: dict.directory.latest
 *
 * @param data lines to write
 * @param fileName target file name
 * @param append whether to append instead of overwrite
 */
void writeFile(List<String> data, String fileName, Boolean append);
/**
 * get the knowledge file root directory
 *
 * @return absolute root path of the dictionary files
 */
String getDictRootPath();
/**
 * delete a dictionary file,
 * backing it up automatically first
 *
 * @param fileName name of the file to delete
 * @return whether the deletion succeeded
 */
Boolean deleteDictFile(String fileName);
/**
 * delete a file directly, without backup
 *
 * @param fileName name of the file to delete
 */
void deleteFile(String fileName);
}
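
A usage sketch of the contract (the config instance and file name are hypothetical; LocalFileHandler further down is the bundled implementation):

FileHandler handler = new LocalFileHandler(localFileConfig);
handler.writeFile(Arrays.asList("vip nz 1000"), "dic_model_1.txt", false); // an existing file is backed up first
if (handler.existPath(handler.getDictRootPath())) {
handler.deleteDictFile("dic_model_1.txt"); // backs up automatically before deleting
}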

View File

@@ -1,32 +0,0 @@
package com.tencent.supersonic.chat.core.knowledge;
import com.hankcs.hanlp.corpus.io.IIOAdapter;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URI;
import lombok.extern.slf4j.Slf4j;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
@Slf4j
public class HadoopFileIOAdapter implements IIOAdapter {
@Override
public InputStream open(String path) throws IOException {
log.info("open:{}", path);
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(URI.create(path), conf);
return fs.open(new Path(path));
}
@Override
public OutputStream create(String path) throws IOException {
log.info("create:{}", path);
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(URI.create(path), conf);
return fs.create(new Path(path));
}
}
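
HanLP resolves dictionary paths through the pluggable HanLP.Config.IOAdapter, so wiring in this adapter makes hdfs:// dictionary paths work (a sketch, assuming dictionary paths are configured as HDFS URIs):

HanLP.Config.IOAdapter = new HadoopFileIOAdapter();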

View File

@@ -1,44 +0,0 @@
package com.tencent.supersonic.chat.core.knowledge;
import com.google.common.base.Objects;
import java.util.List;
import lombok.Data;
import lombok.ToString;
@Data
@ToString
public class HanlpMapResult extends MapResult {
private List<String> natures;
private int offset = 0;
private double similarity;
public HanlpMapResult(String name, List<String> natures, String detectWord) {
this.name = name;
this.natures = natures;
this.detectWord = detectWord;
}
@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
HanlpMapResult hanlpMapResult = (HanlpMapResult) o;
return Objects.equal(name, hanlpMapResult.name) && Objects.equal(natures, hanlpMapResult.natures);
}
@Override
public int hashCode() {
return Objects.hashCode(name, natures);
}
public void setOffset(int offset) {
this.offset = offset;
}
}

View File

@@ -1,55 +0,0 @@
package com.tencent.supersonic.chat.core.knowledge;
import com.tencent.supersonic.chat.core.utils.NatureHelper;
import lombok.Data;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import org.springframework.util.CollectionUtils;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.Set;
@Data
@Service
public class LoadRemoveService {
@Value("${mapper.remove.agentId:}")
private Integer mapperRemoveAgentId;
@Value("${mapper.remove.nature.prefix:}")
private String mapperRemoveNaturePrefix;
public List removeNatures(List value, Integer agentId, Set<Long> detectModelIds) {
if (CollectionUtils.isEmpty(value)) {
return value;
}
List<String> resultList = new ArrayList<>(value);
if (!CollectionUtils.isEmpty(detectModelIds)) {
resultList.removeIf(nature -> {
if (Objects.isNull(nature)) {
return false;
}
Long modelId = NatureHelper.getViewId(nature);
if (Objects.nonNull(modelId)) {
return !detectModelIds.contains(modelId);
}
return false;
});
}
if (Objects.nonNull(mapperRemoveAgentId)
&& mapperRemoveAgentId.equals(agentId)
&& StringUtils.isNotBlank(mapperRemoveNaturePrefix)) {
resultList.removeIf(nature -> {
if (Objects.isNull(nature)) {
return false;
}
return nature.startsWith(mapperRemoveNaturePrefix);
});
}
return resultList;
}
}

View File

@@ -1,127 +0,0 @@
package com.tencent.supersonic.chat.core.knowledge;
import com.tencent.supersonic.chat.core.config.LocalFileConfig;
import java.io.BufferedWriter;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.nio.file.StandardOpenOption;
import java.util.List;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Component;
import org.springframework.util.CollectionUtils;
@Slf4j
@Component
public class LocalFileHandler implements FileHandler {
private final LocalFileConfig localFileConfig;
public LocalFileHandler(LocalFileConfig localFileConfig) {
this.localFileConfig = localFileConfig;
}
@Override
public void backupFile(String fileName) {
String dictDirectoryBackup = localFileConfig.getDictDirectoryBackup();
if (!existPath(dictDirectoryBackup)) {
createDir(dictDirectoryBackup);
}
String source = localFileConfig.getDictDirectoryLatest() + "/" + fileName;
String target = dictDirectoryBackup + "/" + fileName;
Path sourcePath = Paths.get(source);
Path targetPath = Paths.get(target);
try {
Files.copy(sourcePath, targetPath, StandardCopyOption.REPLACE_EXISTING);
log.info("backupFile successfully! path:{}", targetPath.toAbsolutePath());
} catch (IOException e) {
log.info("Failed to copy file: " + e.getMessage());
}
}
@Override
public void createDir(String directoryPath) {
Path path = Paths.get(directoryPath);
try {
Files.createDirectories(path);
log.info("Directory created successfully!");
} catch (IOException e) {
log.info("Failed to create directory: " + e.getMessage());
}
}
@Override
public void deleteFile(String filePath) {
Path path = Paths.get(filePath);
try {
Files.delete(path);
log.info("File:{} deleted successfully!", getAbsolutePath(filePath));
} catch (IOException e) {
log.warn("Failed to delete file:{}, e:", getAbsolutePath(filePath), e);
}
}
@Override
public Boolean existPath(String pathStr) {
Path path = Paths.get(pathStr);
if (Files.exists(path)) {
log.info("path:{} exists!", getAbsolutePath(pathStr));
return true;
} else {
log.info("path:{} not exists!", getAbsolutePath(pathStr));
}
return false;
}
@Override
public void writeFile(List<String> lines, String fileName, Boolean append) {
String dictDirectoryLatest = localFileConfig.getDictDirectoryLatest();
if (!existPath(dictDirectoryLatest)) {
createDir(dictDirectoryLatest);
}
String filePath = dictDirectoryLatest + "/" + fileName;
if (existPath(filePath)) {
backupFile(fileName);
}
try (BufferedWriter writer = getWriter(filePath, append)) {
if (!CollectionUtils.isEmpty(lines)) {
for (String line : lines) {
writer.write(line);
writer.newLine();
}
}
log.info("File:{} written successfully!", getAbsolutePath(filePath));
} catch (IOException e) {
log.info("Failed to write file:{}, e:", getAbsolutePath(filePath), e);
}
}
public String getAbsolutePath(String path) {
return Paths.get(path).toAbsolutePath().toString();
}
@Override
public String getDictRootPath() {
return Paths.get(localFileConfig.getDictDirectoryLatest()).toAbsolutePath().toString();
}
@Override
public Boolean deleteDictFile(String fileName) {
backupFile(fileName);
deleteFile(localFileConfig.getDictDirectoryLatest() + "/" + fileName);
return true;
}
private BufferedWriter getWriter(String filePath, Boolean append) throws IOException {
if (append) {
return Files.newBufferedWriter(Paths.get(filePath), StandardCharsets.UTF_8, StandardOpenOption.APPEND);
}
return Files.newBufferedWriter(Paths.get(filePath), StandardCharsets.UTF_8);
}
}

View File

@@ -1,13 +0,0 @@
package com.tencent.supersonic.chat.core.knowledge;
import java.io.Serializable;
import lombok.Data;
import lombok.ToString;
@Data
@ToString
public class MapResult implements Serializable {
protected String name;
protected String detectWord;
}

View File

@@ -1,396 +0,0 @@
package com.tencent.supersonic.chat.core.knowledge;
import static com.hankcs.hanlp.utility.Predefine.logger;
import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.collection.trie.DoubleArrayTrie;
import com.hankcs.hanlp.collection.trie.bintrie.BinTrie;
import com.hankcs.hanlp.corpus.io.ByteArray;
import com.hankcs.hanlp.corpus.io.IOUtil;
import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.dictionary.CoreDictionary;
import com.hankcs.hanlp.dictionary.DynamicCustomDictionary;
import com.hankcs.hanlp.dictionary.other.CharTable;
import com.hankcs.hanlp.seg.common.Term;
import com.hankcs.hanlp.utility.LexiconUtility;
import com.hankcs.hanlp.utility.Predefine;
import com.hankcs.hanlp.utility.TextUtility;
import com.tencent.supersonic.chat.core.knowledge.dictionary.DictionaryAttributeUtil;
import com.tencent.supersonic.chat.core.utils.HanlpHelper;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Arrays;
import java.util.Comparator;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.PriorityQueue;
import java.util.TreeMap;
import java.util.concurrent.ConcurrentHashMap;
public class MultiCustomDictionary extends DynamicCustomDictionary {
public static int MAX_SIZE = 10;
public static Boolean removeDuplicates = true;
public static ConcurrentHashMap<String, PriorityQueue<Term>> NATURE_TO_VALUES = new ConcurrentHashMap<>();
private static boolean addToSuggesterTrie = true;
public MultiCustomDictionary() {
this(HanLP.Config.CustomDictionaryPath);
}
public MultiCustomDictionary(String... path) {
super(path);
}
/***
 * load a dictionary file
 * @param path dictionary file path
 * @param defaultNature default nature for entries that specify none
 * @param map collector for word attributes
 * @param customNatureCollector collector for custom natures
 * @param addToSuggeterTrie whether to also index words for the suggester
 * @return whether loading succeeded
 */
public static boolean load(String path, Nature defaultNature, TreeMap<String, CoreDictionary.Attribute> map,
LinkedHashSet<Nature> customNatureCollector, boolean addToSuggeterTrie) {
try {
String splitter = "\\s";
if (path.endsWith(".csv")) {
splitter = ",";
}
BufferedReader br = new BufferedReader(new InputStreamReader(IOUtil.newInputStream(path), "UTF-8"));
boolean firstLine = true;
while (true) {
String[] param;
do {
String line;
if ((line = br.readLine()) == null) {
br.close();
return true;
}
if (firstLine) {
line = IOUtil.removeUTF8BOM(line);
firstLine = false;
}
param = line.split(splitter);
} while (param[0].length() == 0);
if (HanLP.Config.Normalization) {
param[0] = CharTable.convert(param[0]);
}
int natureCount = (param.length - 1) / 2;
CoreDictionary.Attribute attribute;
boolean isLetters = isLetters(param[0]);
String original = null;
String word = getWordBySpace(param[0]);
if (isLetters) {
original = word;
word = word.toLowerCase();
}
if (natureCount == 0) {
attribute = new CoreDictionary.Attribute(defaultNature);
} else {
attribute = new CoreDictionary.Attribute(natureCount);
for (int i = 0; i < natureCount; ++i) {
attribute.nature[i] = LexiconUtility.convertStringToNature(param[1 + 2 * i],
customNatureCollector);
attribute.frequency[i] = Integer.parseInt(param[2 + 2 * i]);
attribute.totalFrequency += attribute.frequency[i];
}
}
attribute.original = original;
if (removeDuplicates && map.containsKey(word)) {
attribute = DictionaryAttributeUtil.getAttribute(map.get(word), attribute);
}
map.put(word, attribute);
if (addToSuggeterTrie) {
SearchService.put(word, attribute);
}
for (int i = 0; i < attribute.nature.length; i++) {
Nature nature = attribute.nature[i];
PriorityQueue<Term> priorityQueue = NATURE_TO_VALUES.get(nature.toString());
if (Objects.isNull(priorityQueue)) {
priorityQueue = new PriorityQueue<>(MAX_SIZE,
Comparator.comparingInt(Term::getFrequency).reversed());
NATURE_TO_VALUES.put(nature.toString(), priorityQueue);
}
Term term = new Term(word, nature);
term.setFrequency(attribute.frequency[i]);
if (!priorityQueue.contains(term) && priorityQueue.size() < MAX_SIZE) {
priorityQueue.add(term);
}
}
}
} catch (Exception var12) {
logger.severe("自定义词典" + path + "读取错误!" + var12);
return false;
}
}
public boolean load(String... path) {
this.path = path;
long start = System.currentTimeMillis();
if (!this.loadMainDictionary(path[0])) {
Predefine.logger.warning("Failed to load custom dictionary " + Arrays.toString(path));
return false;
} else {
Predefine.logger.info(
"Custom dictionary loaded successfully: " + this.dat.size() + " entries in " + (System.currentTimeMillis() - start) + "ms");
this.path = path;
return true;
}
}
/***
 * load the main dictionary
 * @param mainPath main dictionary path
 * @param path all dictionary paths
 * @param dat the trie to build into
 * @param isCache whether to cache the result as a dat file
 * @param addToSuggestTrie whether to also index words for the suggester
 * @return whether loading succeeded
 */
public static boolean loadMainDictionary(String mainPath, String[] path,
DoubleArrayTrie<CoreDictionary.Attribute> dat, boolean isCache, boolean addToSuggestTrie) {
Predefine.logger.info("自定义词典开始加载:" + mainPath);
if (loadDat(mainPath, dat)) {
return true;
} else {
TreeMap<String, CoreDictionary.Attribute> map = new TreeMap();
LinkedHashSet customNatureCollector = new LinkedHashSet();
try {
for (String p : path) {
Nature defaultNature = Nature.n;
File file = new File(p);
String fileName = file.getName();
int cut = fileName.lastIndexOf(32);
if (cut > 0) {
String nature = fileName.substring(cut + 1);
p = file.getParent() + File.separator + fileName.substring(0, cut);
try {
defaultNature = LexiconUtility.convertStringToNature(nature, customNatureCollector);
} catch (Exception var16) {
Predefine.logger.severe("配置文件【" + p + "】写错了!" + var16);
continue;
}
}
Predefine.logger.info("以默认词性[" + defaultNature + "]加载自定义词典" + p + "中……");
boolean success = load(p, defaultNature, map, customNatureCollector, addToSuggestTrie);
if (!success) {
Predefine.logger.warning("失败:" + p);
}
}
if (map.size() == 0) {
Predefine.logger.warning("No entries were loaded");
map.put("未##它", null);
}
logger.info("Building DoubleArrayTrie...");
dat.build(map);
if (addToSuggestTrie) {
// SearchService.save();
}
if (isCache) {
// caching to a dat file makes the next load much faster
logger.info("Caching dictionary as a dat file...");
// cache the value file
List<CoreDictionary.Attribute> attributeList = new LinkedList<CoreDictionary.Attribute>();
for (Map.Entry<String, CoreDictionary.Attribute> entry : map.entrySet()) {
attributeList.add(entry.getValue());
}
DataOutputStream out = new DataOutputStream(
new BufferedOutputStream(IOUtil.newOutputStream(mainPath + ".bin")));
if (customNatureCollector.isEmpty()) {
for (int i = Nature.begin.ordinal() + 1; i < Nature.values().length; ++i) {
Nature nature = Nature.values()[i];
if (Objects.nonNull(nature)) {
customNatureCollector.add(nature);
}
}
}
IOUtil.writeCustomNature(out, customNatureCollector);
out.writeInt(attributeList.size());
for (CoreDictionary.Attribute attribute : attributeList) {
attribute.save(out);
}
dat.save(out);
out.close();
}
} catch (FileNotFoundException var17) {
logger.severe("Custom dictionary " + mainPath + " does not exist! " + var17);
return false;
} catch (IOException var18) {
logger.severe("Custom dictionary " + mainPath + " read error! " + var18);
return false;
} catch (Exception var19) {
logger.warning("Failed to cache custom dictionary " + mainPath + "!\n" + TextUtility.exceptionToString(var19));
}
}
return true;
}
}
public boolean loadMainDictionary(String mainPath) {
return loadMainDictionary(mainPath, this.path, this.dat, true, addToSuggesterTrie);
}
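// Fast path: reuse the .bin cache written by loadMainDictionary; returns false when the cache
// is missing or stale so the caller falls back to rebuilding from the text dictionaries.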
public static boolean loadDat(String path, DoubleArrayTrie<CoreDictionary.Attribute> dat) {
return loadDat(path, HanLP.Config.CustomDictionaryPath, dat);
}
public static boolean loadDat(String path, String[] customDicPath, DoubleArrayTrie<CoreDictionary.Attribute> dat) {
try {
if (HanLP.Config.CustomDictionaryAutoRefreshCache && isDicNeedUpdate(path, customDicPath)) {
return false;
} else {
ByteArray byteArray = ByteArray.createByteArray(path + ".bin");
if (byteArray == null) {
return false;
} else {
int size = byteArray.nextInt();
if (size < 0) {
while (true) {
++size;
if (size > 0) {
size = byteArray.nextInt();
break;
}
Nature.create(byteArray.nextString());
}
}
CoreDictionary.Attribute[] attributes = new CoreDictionary.Attribute[size];
Nature[] natureIndexArray = Nature.values();
for (int i = 0; i < size; ++i) {
int currentTotalFrequency = byteArray.nextInt();
int length = byteArray.nextInt();
attributes[i] = new CoreDictionary.Attribute(length);
attributes[i].totalFrequency = currentTotalFrequency;
for (int j = 0; j < length; ++j) {
attributes[i].nature[j] = natureIndexArray[byteArray.nextInt()];
attributes[i].frequency[j] = byteArray.nextInt();
}
}
return dat.load(byteArray, attributes);
}
}
} catch (Exception var11) {
logger.warning("读取失败,问题发生在" + TextUtility.exceptionToString(var11));
return false;
}
}
public static boolean isLetters(String str) {
// A word qualifies only if it is longer than one char and contains an uppercase ASCII letter;
// such words are lowercased for the trie while the original casing is kept separately.
char[] chars = str.toCharArray();
if (chars.length <= 1) {
return false;
}
for (char c : chars) {
if (c >= 'A' && c <= 'Z') {
return true;
}
}
return false;
}
public static boolean isLowerLetter(String str) {
for (char c : str.toCharArray()) {
if (c >= 'a' && c <= 'z') {
return true;
}
}
return false;
}
public static String getWordBySpace(String word) {
if (word.contains(HanlpHelper.SPACE_SPILT)) {
return word.replace(HanlpHelper.SPACE_SPILT, " ");
}
return word;
}
public boolean reload() {
if (this.path != null && this.path.length != 0) {
IOUtil.deleteFile(this.path[0] + ".bin");
boolean loadCacheOk = loadDat(this.path[0], this.path, this.dat);
if (!loadCacheOk) {
return loadMainDictionary(this.path[0], this.path, this.dat, true, addToSuggesterTrie);
}
}
}
return false;
}
public synchronized boolean insert(String word, String natureWithFrequency) {
if (word == null) {
return false;
} else {
if (HanLP.Config.Normalization) {
word = CharTable.convert(word);
}
CoreDictionary.Attribute att = natureWithFrequency == null ? new CoreDictionary.Attribute(Nature.nz, 1)
: CoreDictionary.Attribute.create(natureWithFrequency);
boolean isLetters = isLetters(word);
word = getWordBySpace(word);
String original = null;
if (isLetters) {
original = word;
word = word.toLowerCase();
}
if (att == null) {
return false;
} else if (this.dat.containsKey(word)) {
att.original = original;
att = DictionaryAttributeUtil.getAttribute(this.dat.get(word), att);
this.dat.set(word, att);
// return true;
} else {
if (this.trie == null) {
this.trie = new BinTrie();
}
att.original = original;
if (this.trie.containsKey(word)) {
att = DictionaryAttributeUtil.getAttribute(this.trie.get(word), att);
}
this.trie.put(word, att);
// return true;
}
if (addToSuggesterTrie) {
SearchService.put(word, att);
}
return true;
}
}
}
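For orientation, a minimal usage sketch of the loader above (the path, words, and no-arg constructor are assumptions for illustration, not part of the commit). Each dictionary line is parsed as "word nature1 freq1 [nature2 freq2 ...]", and insert() takes a "nature frequency" string:

MultiCustomDictionary dictionary = new MultiCustomDictionary(); // hypothetical instantiation
dictionary.load("data/dictionary/custom.txt"); // lines like: "北京 _5_12 99999999"
// A null natureWithFrequency falls back to (Nature.nz, 1); words containing uppercase letters
// are lowercased for the trie while the raw form is kept in attribute.original.
dictionary.insert("SuperSonic", "nz 1000");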

View File

@@ -1,170 +0,0 @@
package com.tencent.supersonic.chat.core.knowledge;
import com.hankcs.hanlp.collection.trie.bintrie.BaseNode;
import com.hankcs.hanlp.collection.trie.bintrie.BinTrie;
import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.dictionary.CoreDictionary;
import com.hankcs.hanlp.seg.common.Term;
import com.tencent.supersonic.chat.api.pojo.request.DimensionValueReq;
import com.tencent.supersonic.chat.core.knowledge.dictionary.DictionaryAttributeUtil;
import com.tencent.supersonic.common.pojo.enums.DictWordType;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.PriorityQueue;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.stream.Collectors;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.springframework.util.CollectionUtils;
@Slf4j
public class SearchService {
public static final int SEARCH_SIZE = 200;
private static BinTrie<List<String>> trie;
private static BinTrie<List<String>> suffixTrie;
static {
trie = new BinTrie<>();
suffixTrie = new BinTrie<>();
}
/**
 * Prefix search over the word trie.
 * @param key query segment
 * @param limit maximum entries to walk in the trie
 * @param agentId agent scope for the walk
 * @param detectModelIds view ids used to restrict matches
 * @return matched results, capped at SEARCH_SIZE
 */
public static List<HanlpMapResult> prefixSearch(String key, int limit, Integer agentId, Set<Long> detectModelIds) {
return prefixSearch(key, limit, agentId, trie, detectModelIds);
}
public static List<HanlpMapResult> prefixSearch(String key, int limit, Integer agentId,
BinTrie<List<String>> binTrie, Set<Long> detectModelIds) {
Set<Map.Entry<String, List<String>>> result = prefixSearchLimit(key, limit, binTrie, agentId, detectModelIds);
return result.stream().map(
entry -> {
String name = entry.getKey().replace("#", " ");
return new HanlpMapResult(name, entry.getValue(), key);
}
).sorted((a, b) -> -(b.getName().length() - a.getName().length()))
.limit(SEARCH_SIZE)
.collect(Collectors.toList());
}
/**
 * Suffix search: the key is reversed and matched against the suffix trie.
 * @param key query segment
 * @param limit maximum entries to walk in the trie
 * @param agentId agent scope for the walk
 * @param detectModelIds view ids used to restrict matches
 * @return matched results, capped at SEARCH_SIZE
 */
public static List<HanlpMapResult> suffixSearch(String key, int limit, Integer agentId, Set<Long> detectModelIds) {
String reverseDetectSegment = StringUtils.reverse(key);
return suffixSearch(reverseDetectSegment, limit, agentId, suffixTrie, detectModelIds);
}
public static List<HanlpMapResult> suffixSearch(String key, int limit, Integer agentId,
BinTrie<List<String>> binTrie, Set<Long> detectModelIds) {
Set<Map.Entry<String, List<String>>> result = prefixSearchLimit(key, limit, binTrie, agentId, detectModelIds);
return result.stream().map(
entry -> {
String name = entry.getKey().replace("#", " ");
List<String> natures = entry.getValue().stream()
.map(nature -> nature.replaceAll(DictWordType.SUFFIX.getType(), ""))
.collect(Collectors.toList());
name = StringUtils.reverse(name);
return new HanlpMapResult(name, natures, key);
}
).sorted((a, b) -> -(b.getName().length() - a.getName().length()))
.limit(SEARCH_SIZE)
.collect(Collectors.toList());
}
private static Set<Map.Entry<String, List<String>>> prefixSearchLimit(String key, int limit,
BinTrie<List<String>> binTrie, Integer agentId, Set<Long> detectModelIds) {
key = key.toLowerCase();
Set<Map.Entry<String, List<String>>> entrySet = new TreeSet<Map.Entry<String, List<String>>>();
StringBuilder sb = new StringBuilder();
if (StringUtils.isNotBlank(key)) {
sb = new StringBuilder(key.substring(0, key.length() - 1));
}
BaseNode branch = binTrie;
char[] chars = key.toCharArray();
for (char aChar : chars) {
if (branch == null) {
return entrySet;
}
branch = branch.getChild(aChar);
}
if (branch == null) {
return entrySet;
}
branch.walkLimit(sb, entrySet, limit, agentId, detectModelIds);
return entrySet;
}
public static void clear() {
log.info("clear all trie");
trie = new BinTrie<>();
suffixTrie = new BinTrie<>();
}
public static void put(String key, CoreDictionary.Attribute attribute) {
trie.put(key, getValue(attribute.nature));
}
public static void loadSuffix(List<DictWord> suffixes) {
if (CollectionUtils.isEmpty(suffixes)) {
return;
}
TreeMap<String, CoreDictionary.Attribute> map = new TreeMap();
for (DictWord suffix : suffixes) {
CoreDictionary.Attribute attributeNew = suffix.getNatureWithFrequency() == null
? new CoreDictionary.Attribute(Nature.nz, 1)
: CoreDictionary.Attribute.create(suffix.getNatureWithFrequency());
if (map.containsKey(suffix.getWord())) {
attributeNew = DictionaryAttributeUtil.getAttribute(map.get(suffix.getWord()), attributeNew);
}
map.put(suffix.getWord(), attributeNew);
}
for (Map.Entry<String, CoreDictionary.Attribute> stringAttributeEntry : map.entrySet()) {
putSuffix(stringAttributeEntry.getKey(), stringAttributeEntry.getValue());
}
}
public static void putSuffix(String key, CoreDictionary.Attribute attribute) {
Nature[] nature = attribute.nature;
suffixTrie.put(key, getValue(nature));
}
private static List<String> getValue(Nature[] nature) {
return Arrays.stream(nature).map(entry -> entry.toString()).collect(Collectors.toList());
}
public static void remove(DictWord dictWord, Nature[] natures) {
trie.remove(dictWord.getWord());
if (Objects.nonNull(natures) && natures.length > 0) {
trie.put(dictWord.getWord(), getValue(natures));
}
if (dictWord.getNature().contains(DictWordType.METRIC.getType()) || dictWord.getNature()
.contains(DictWordType.DIMENSION.getType())) {
suffixTrie.remove(dictWord.getWord());
}
}
public static List<String> getDimensionValue(DimensionValueReq dimensionValueReq) {
String nature = DictWordType.NATURE_SPILT + dimensionValueReq.getModelId() + DictWordType.NATURE_SPILT
+ dimensionValueReq.getElementID();
PriorityQueue<Term> terms = MultiCustomDictionary.NATURE_TO_VALUES.get(nature);
if (org.apache.commons.collections.CollectionUtils.isEmpty(terms)) {
return new ArrayList<>();
}
return terms.stream().map(term -> term.getWord()).collect(Collectors.toList());
}
}
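A minimal usage sketch for the tries above (key, limit, and id values are illustrative):

Integer agentId = 1;
Set<Long> detectViewIds = java.util.Collections.singleton(5L);
// prefix match against the word trie
List<HanlpMapResult> byPrefix = SearchService.prefixSearch("sup", 10, agentId, detectViewIds);
// suffix match: the key is reversed internally and walked against the suffix trie
List<HanlpMapResult> bySuffix = SearchService.suffixSearch("nic", 10, agentId, detectViewIds);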

View File

@@ -1,22 +0,0 @@
package com.tencent.supersonic.chat.core.knowledge;
import lombok.Builder;
import lombok.Data;
import lombok.ToString;
import java.io.Serializable;
@Data
@ToString
@Builder
public class ViewInfoStat implements Serializable {
private long viewCount;
private long metricViewCount;
private long dimensionViewCount;
private long dimensionValueViewCount;
}
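Being a Lombok @Builder POJO, an instance is assembled as in this illustrative snippet:

ViewInfoStat stat = ViewInfoStat.builder()
.viewCount(3L)
.metricViewCount(10L)
.dimensionViewCount(20L)
.dimensionValueViewCount(500L)
.build();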

View File

@@ -1,38 +0,0 @@
package com.tencent.supersonic.chat.core.knowledge.builder;
import com.tencent.supersonic.chat.api.pojo.SchemaElement;
import com.tencent.supersonic.chat.core.knowledge.DictWord;
import java.util.ArrayList;
import java.util.List;
import lombok.extern.slf4j.Slf4j;
/**
* base word nature
*/
@Slf4j
public abstract class BaseWordBuilder {
public static final Long DEFAULT_FREQUENCY = 100000L;
public List<DictWord> getDictWords(List<SchemaElement> schemaElements) {
List<DictWord> dictWords = new ArrayList<>();
try {
dictWords = getDictWordsWithException(schemaElements);
} catch (Exception e) {
log.error("getWordNatureList error,", e);
}
return dictWords;
}
protected List<DictWord> getDictWordsWithException(List<SchemaElement> schemaElements) {
List<DictWord> dictWords = new ArrayList<>();
for (SchemaElement schemaElement : schemaElements) {
dictWords.addAll(doGet(schemaElement.getName(), schemaElement));
}
return dictWords;
}
protected abstract List<DictWord> doGet(String word, SchemaElement schemaElement);
}

View File

@@ -1,64 +0,0 @@
package com.tencent.supersonic.chat.core.knowledge.builder;
import com.google.common.collect.Lists;
import com.tencent.supersonic.chat.api.pojo.SchemaElement;
import com.tencent.supersonic.chat.core.knowledge.DictWord;
import com.tencent.supersonic.common.pojo.enums.DictWordType;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import org.springframework.util.CollectionUtils;
import java.util.ArrayList;
import java.util.List;
/**
* dimension word nature
*/
@Service
public class DimensionWordBuilder extends BaseWordBuilder {
@Value("${nlp.dimension.use.suffix:true}")
private boolean nlpDimensionUseSuffix = true;
@Override
public List<DictWord> doGet(String word, SchemaElement schemaElement) {
List<DictWord> result = Lists.newArrayList();
result.add(getOnwWordNature(word, schemaElement, false));
result.addAll(getOnwWordNatureAlias(schemaElement, false));
if (nlpDimensionUseSuffix) {
String reverseWord = StringUtils.reverse(word);
if (StringUtils.isNotEmpty(word) && !word.equalsIgnoreCase(reverseWord)) {
result.add(getOnwWordNature(reverseWord, schemaElement, true));
}
}
return result;
}
private DictWord getOnwWordNature(String word, SchemaElement schemaElement, boolean isSuffix) {
DictWord dictWord = new DictWord();
dictWord.setWord(word);
Long viewId = schemaElement.getView();
String nature = DictWordType.NATURE_SPILT + viewId + DictWordType.NATURE_SPILT + schemaElement.getId()
+ DictWordType.DIMENSION.getType();
if (isSuffix) {
nature = DictWordType.NATURE_SPILT + viewId + DictWordType.NATURE_SPILT + schemaElement.getId()
+ DictWordType.SUFFIX.getType() + DictWordType.DIMENSION.getType();
}
dictWord.setNatureWithFrequency(String.format("%s " + DEFAULT_FREQUENCY, nature));
return dictWord;
}
private List<DictWord> getOnwWordNatureAlias(SchemaElement schemaElement, boolean isSuffix) {
List<DictWord> dictWords = new ArrayList<>();
if (CollectionUtils.isEmpty(schemaElement.getAlias())) {
return dictWords;
}
for (String alias : schemaElement.getAlias()) {
dictWords.add(getOnwWordNature(alias, schemaElement, false));
}
return dictWords;
}
}
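The natures emitted above follow the project's "_viewId_elementId" convention; assuming NATURE_SPILT is "_" and the dimension/suffix type strings are "_dimension" and "_suffix", a dimension "city" (view 5, element 12) would yield roughly:

// word: "city"  ->  natureWithFrequency: "_5_12_dimension 100000"
// word: "ytic"  ->  natureWithFrequency: "_5_12_suffix_dimension 100000" (reversed word, suffix nature)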

View File

@@ -1,45 +0,0 @@
package com.tencent.supersonic.chat.core.knowledge.builder;
import com.google.common.collect.Lists;
import com.tencent.supersonic.chat.api.pojo.SchemaElement;
import com.tencent.supersonic.chat.core.knowledge.DictWord;
import com.tencent.supersonic.common.pojo.enums.DictWordType;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;
import org.springframework.util.CollectionUtils;
import java.util.List;
import java.util.Objects;
/**
 * entity word nature
 */
@Service
@Slf4j
public class EntityWordBuilder extends BaseWordBuilder {
@Override
public List<DictWord> doGet(String word, SchemaElement schemaElement) {
List<DictWord> result = Lists.newArrayList();
if (Objects.isNull(schemaElement)) {
return result;
}
Long view = schemaElement.getView();
String nature = DictWordType.NATURE_SPILT + view + DictWordType.NATURE_SPILT + schemaElement.getId()
+ DictWordType.ENTITY.getType();
if (!CollectionUtils.isEmpty(schemaElement.getAlias())) {
schemaElement.getAlias().stream().forEach(alias -> {
DictWord dictWordAlias = new DictWord();
dictWordAlias.setWord(alias);
dictWordAlias.setNatureWithFrequency(String.format("%s " + DEFAULT_FREQUENCY * 2, nature));
result.add(dictWordAlias);
});
}
return result;
}
}

View File

@@ -1,64 +0,0 @@
package com.tencent.supersonic.chat.core.knowledge.builder;
import com.google.common.collect.Lists;
import com.tencent.supersonic.chat.api.pojo.SchemaElement;
import com.tencent.supersonic.chat.core.knowledge.DictWord;
import com.tencent.supersonic.common.pojo.enums.DictWordType;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import org.springframework.util.CollectionUtils;
import java.util.ArrayList;
import java.util.List;
/**
* Metric DictWord
*/
@Service
public class MetricWordBuilder extends BaseWordBuilder {
@Value("${nlp.metric.use.suffix:true}")
private boolean nlpMetricUseSuffix = true;
@Override
public List<DictWord> doGet(String word, SchemaElement schemaElement) {
List<DictWord> result = Lists.newArrayList();
result.add(getOnwWordNature(word, schemaElement, false));
result.addAll(getOnwWordNatureAlias(schemaElement, false));
if (nlpMetricUseSuffix) {
String reverseWord = StringUtils.reverse(word);
if (!word.equalsIgnoreCase(reverseWord)) {
result.add(getOnwWordNature(reverseWord, schemaElement, true));
}
}
return result;
}
private DictWord getOnwWordNature(String word, SchemaElement schemaElement, boolean isSuffix) {
DictWord dictWord = new DictWord();
dictWord.setWord(word);
Long viewId = schemaElement.getView();
String nature = DictWordType.NATURE_SPILT + viewId + DictWordType.NATURE_SPILT + schemaElement.getId()
+ DictWordType.METRIC.getType();
if (isSuffix) {
nature = DictWordType.NATURE_SPILT + viewId + DictWordType.NATURE_SPILT + schemaElement.getId()
+ DictWordType.SUFFIX.getType() + DictWordType.METRIC.getType();
}
dictWord.setNatureWithFrequency(String.format("%s " + DEFAULT_FREQUENCY, nature));
return dictWord;
}
private List<DictWord> getOnwWordNatureAlias(SchemaElement schemaElement, boolean isSuffix) {
List<DictWord> dictWords = new ArrayList<>();
if (CollectionUtils.isEmpty(schemaElement.getAlias())) {
return dictWords;
}
for (String alias : schemaElement.getAlias()) {
dictWords.add(getOnwWordNature(alias, schemaElement, false));
}
return dictWords;
}
}

View File

@@ -1,44 +0,0 @@
package com.tencent.supersonic.chat.core.knowledge.builder;
import com.google.common.collect.Lists;
import com.tencent.supersonic.chat.api.pojo.SchemaElement;
import com.tencent.supersonic.chat.core.knowledge.DictWord;
import com.tencent.supersonic.common.pojo.enums.DictWordType;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections.CollectionUtils;
import org.springframework.stereotype.Service;
import java.util.List;
/**
* model word nature
*/
@Service
@Slf4j
public class ModelWordBuilder extends BaseWordBuilder {
@Override
public List<DictWord> doGet(String word, SchemaElement schemaElement) {
List<DictWord> result = Lists.newArrayList();
//modelName
DictWord dictWord = buildDictWord(word, schemaElement.getView());
result.add(dictWord);
//alias
List<String> aliasList = schemaElement.getAlias();
if (CollectionUtils.isNotEmpty(aliasList)) {
for (String alias : aliasList) {
result.add(buildDictWord(alias, schemaElement.getView()));
}
}
return result;
}
private DictWord buildDictWord(String word, Long modelId) {
DictWord dictWord = new DictWord();
dictWord.setWord(word);
String nature = DictWordType.NATURE_SPILT + modelId;
dictWord.setNatureWithFrequency(String.format("%s " + DEFAULT_FREQUENCY, nature));
return dictWord;
}
}

View File

@@ -1,41 +0,0 @@
package com.tencent.supersonic.chat.core.knowledge.builder;
import com.google.common.collect.Lists;
import com.tencent.supersonic.chat.api.pojo.SchemaElement;
import com.tencent.supersonic.chat.core.knowledge.DictWord;
import com.tencent.supersonic.common.pojo.enums.DictWordType;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;
import org.springframework.util.CollectionUtils;
import java.util.List;
import java.util.Objects;
/**
* dimension value wordNature
*/
@Service
@Slf4j
public class ValueWordBuilder extends BaseWordBuilder {
@Override
public List<DictWord> doGet(String word, SchemaElement schemaElement) {
List<DictWord> result = Lists.newArrayList();
if (Objects.nonNull(schemaElement) && !CollectionUtils.isEmpty(schemaElement.getAlias())) {
schemaElement.getAlias().stream().forEach(value -> {
DictWord dictWord = new DictWord();
Long viewId = schemaElement.getView();
String nature = DictWordType.NATURE_SPILT + viewId + DictWordType.NATURE_SPILT + schemaElement.getId();
dictWord.setNatureWithFrequency(String.format("%s " + DEFAULT_FREQUENCY, nature));
dictWord.setWord(value);
result.add(dictWord);
});
}
log.debug("ValueWordBuilder, result:{}", result);
return result;
}
}

View File

@@ -1,26 +0,0 @@
package com.tencent.supersonic.chat.core.knowledge.builder;
import com.tencent.supersonic.common.pojo.enums.DictWordType;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
/**
* DictWord Strategy Factory
*/
public class WordBuilderFactory {
private static Map<DictWordType, BaseWordBuilder> wordNatures = new ConcurrentHashMap<>();
static {
wordNatures.put(DictWordType.DIMENSION, new DimensionWordBuilder());
wordNatures.put(DictWordType.METRIC, new MetricWordBuilder());
wordNatures.put(DictWordType.VIEW, new ModelWordBuilder());
wordNatures.put(DictWordType.ENTITY, new EntityWordBuilder());
wordNatures.put(DictWordType.VALUE, new ValueWordBuilder());
}
public static BaseWordBuilder get(DictWordType strategyType) {
return wordNatures.get(strategyType);
}
}
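A short usage sketch of the factory (the schemaElements list is assumed to be prepared elsewhere):

BaseWordBuilder builder = WordBuilderFactory.get(DictWordType.DIMENSION);
List<DictWord> dictWords = builder.getDictWords(schemaElements); // one DictWord per name/alias, plus reversed suffix forms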

View File

@@ -1,8 +1,8 @@
package com.tencent.supersonic.chat.core.mapper;
import com.tencent.supersonic.chat.api.pojo.SchemaElement;
import com.tencent.supersonic.headless.api.pojo.SchemaElement;
import com.tencent.supersonic.chat.api.pojo.SchemaElementMatch;
import com.tencent.supersonic.chat.api.pojo.SchemaElementType;
import com.tencent.supersonic.headless.api.pojo.SchemaElementType;
import com.tencent.supersonic.chat.api.pojo.SchemaMapInfo;
import com.tencent.supersonic.chat.api.pojo.SemanticSchema;
import com.tencent.supersonic.chat.api.pojo.ViewSchema;

View File

@@ -1,8 +1,8 @@
package com.tencent.supersonic.chat.core.mapper;
import com.hankcs.hanlp.seg.common.Term;
import com.tencent.supersonic.chat.core.pojo.QueryContext;
import com.tencent.supersonic.chat.core.utils.NatureHelper;
import com.tencent.supersonic.headless.api.pojo.response.S2Term;
import com.tencent.supersonic.headless.core.knowledge.helper.NatureHelper;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
@@ -28,7 +28,8 @@ public abstract class BaseMatchStrategy<T> implements MatchStrategy<T> {
private MapperHelper mapperHelper;
@Override
public Map<MatchText, List<T>> match(QueryContext queryContext, List<Term> terms, Set<Long> detectViewIds) {
public Map<MatchText, List<T>> match(QueryContext queryContext, List<S2Term> terms,
Set<Long> detectViewIds) {
String text = queryContext.getQueryText();
if (Objects.isNull(terms) || StringUtils.isEmpty(text)) {
return null;
@@ -43,7 +44,7 @@ public abstract class BaseMatchStrategy<T> implements MatchStrategy<T> {
return result;
}
public List<T> detect(QueryContext queryContext, List<Term> terms, Set<Long> detectModelIds) {
public List<T> detect(QueryContext queryContext, List<S2Term> terms, Set<Long> detectModelIds) {
Map<Integer, Integer> regOffsetToLength = getRegOffsetToLength(terms);
String text = queryContext.getQueryText();
Set<T> results = new HashSet<>();
@@ -72,9 +73,10 @@ public abstract class BaseMatchStrategy<T> implements MatchStrategy<T> {
return;
}
public Map<Integer, Integer> getRegOffsetToLength(List<Term> terms) {
return terms.stream().sorted(Comparator.comparing(Term::length))
.collect(Collectors.toMap(Term::getOffset, term -> term.word.length(), (value1, value2) -> value2));
public Map<Integer, Integer> getRegOffsetToLength(List<S2Term> terms) {
return terms.stream().sorted(Comparator.comparing(S2Term::length))
.collect(Collectors.toMap(S2Term::getOffset, term -> term.word.length(),
(value1, value2) -> value2));
}
public void selectResultInOneRound(Set<T> existResults, List<T> oneRoundResults) {
@@ -102,7 +104,7 @@ public abstract class BaseMatchStrategy<T> implements MatchStrategy<T> {
}
}
public List<T> getMatches(QueryContext queryContext, List<Term> terms) {
public List<T> getMatches(QueryContext queryContext, List<S2Term> terms) {
Set<Long> viewIds = mapperHelper.getViewIds(queryContext.getViewId(), queryContext.getAgent());
terms = filterByViewId(terms, viewIds);
Map<MatchText, List<T>> matchResult = match(queryContext, terms, viewIds);
@@ -120,7 +122,7 @@ public abstract class BaseMatchStrategy<T> implements MatchStrategy<T> {
return matches;
}
public List<Term> filterByViewId(List<Term> terms, Set<Long> viewIds) {
public List<S2Term> filterByViewId(List<S2Term> terms, Set<Long> viewIds) {
logTerms(terms);
if (CollectionUtils.isNotEmpty(viewIds)) {
terms = terms.stream().filter(term -> {
@@ -136,11 +138,11 @@ public abstract class BaseMatchStrategy<T> implements MatchStrategy<T> {
return terms;
}
public void logTerms(List<Term> terms) {
public void logTerms(List<S2Term> terms) {
if (CollectionUtils.isEmpty(terms)) {
return;
}
for (Term term : terms) {
for (S2Term term : terms) {
log.debug("word:{},nature:{},frequency:{}", term.word, term.nature.toString(), term.getFrequency());
}
}

View File

@@ -1,10 +1,10 @@
package com.tencent.supersonic.chat.core.mapper;
import com.hankcs.hanlp.seg.common.Term;
import com.tencent.supersonic.chat.api.pojo.SchemaElement;
import com.tencent.supersonic.headless.api.pojo.SchemaElement;
import com.tencent.supersonic.chat.api.pojo.SchemaElementMatch;
import com.tencent.supersonic.chat.core.config.OptimizationConfig;
import com.tencent.supersonic.chat.core.knowledge.DatabaseMapResult;
import com.tencent.supersonic.headless.api.pojo.response.S2Term;
import com.tencent.supersonic.headless.core.knowledge.DatabaseMapResult;
import com.tencent.supersonic.chat.core.pojo.QueryContext;
import com.tencent.supersonic.common.pojo.Constants;
import lombok.extern.slf4j.Slf4j;
@@ -36,7 +36,7 @@ public class DatabaseMatchStrategy extends BaseMatchStrategy<DatabaseMapResult>
private List<SchemaElement> allElements;
@Override
public Map<MatchText, List<DatabaseMapResult>> match(QueryContext queryContext, List<Term> terms,
public Map<MatchText, List<DatabaseMapResult>> match(QueryContext queryContext, List<S2Term> terms,
Set<Long> detectModelIds) {
this.allElements = getSchemaElements(queryContext);
return super.match(queryContext, terms, detectModelIds);

View File

@@ -1,13 +1,13 @@
package com.tencent.supersonic.chat.core.mapper;
import com.alibaba.fastjson.JSONObject;
import com.hankcs.hanlp.seg.common.Term;
import com.tencent.supersonic.chat.core.pojo.QueryContext;
import com.tencent.supersonic.chat.api.pojo.SchemaElement;
import com.tencent.supersonic.headless.api.pojo.SchemaElement;
import com.tencent.supersonic.chat.api.pojo.SchemaElementMatch;
import com.tencent.supersonic.chat.core.knowledge.EmbeddingResult;
import com.tencent.supersonic.chat.core.knowledge.builder.BaseWordBuilder;
import com.tencent.supersonic.chat.core.utils.HanlpHelper;
import com.tencent.supersonic.headless.api.pojo.response.S2Term;
import com.tencent.supersonic.headless.core.knowledge.EmbeddingResult;
import com.tencent.supersonic.headless.core.knowledge.builder.BaseWordBuilder;
import com.tencent.supersonic.headless.core.knowledge.helper.HanlpHelper;
import com.tencent.supersonic.common.util.ContextUtils;
import com.tencent.supersonic.common.util.embedding.Retrieval;
import java.util.List;
@@ -26,7 +26,7 @@ public class EmbeddingMapper extends BaseMapper {
public void doMap(QueryContext queryContext) {
//1. query from embedding by queryText
String queryText = queryContext.getQueryText();
List<Term> terms = HanlpHelper.getTerms(queryText);
List<S2Term> terms = HanlpHelper.getTerms(queryText);
EmbeddingMatchStrategy matchStrategy = ContextUtils.getBean(EmbeddingMatchStrategy.class);
List<EmbeddingResult> matchResults = matchStrategy.getMatches(queryContext, terms);

View File

@@ -2,7 +2,7 @@ package com.tencent.supersonic.chat.core.mapper;
import com.google.common.collect.Lists;
import com.tencent.supersonic.chat.core.config.OptimizationConfig;
import com.tencent.supersonic.chat.core.knowledge.EmbeddingResult;
import com.tencent.supersonic.headless.core.knowledge.EmbeddingResult;
import com.tencent.supersonic.chat.core.pojo.QueryContext;
import com.tencent.supersonic.common.config.EmbeddingConfig;
import com.tencent.supersonic.common.pojo.Constants;

View File

@@ -1,9 +1,9 @@
package com.tencent.supersonic.chat.core.mapper;
import com.tencent.supersonic.chat.api.pojo.SchemaElement;
import com.tencent.supersonic.headless.api.pojo.SchemaElement;
import com.tencent.supersonic.chat.api.pojo.SchemaElementMatch;
import com.tencent.supersonic.chat.api.pojo.SchemaElementType;
import com.tencent.supersonic.headless.api.pojo.SchemaElementType;
import com.tencent.supersonic.chat.api.pojo.SchemaMapInfo;
import com.tencent.supersonic.chat.api.pojo.SemanticSchema;
import com.tencent.supersonic.chat.api.pojo.ViewSchema;

View File

@@ -1,9 +1,9 @@
package com.tencent.supersonic.chat.core.mapper;
import com.hankcs.hanlp.seg.common.Term;
import com.tencent.supersonic.chat.core.config.OptimizationConfig;
import com.tencent.supersonic.chat.core.knowledge.HanlpMapResult;
import com.tencent.supersonic.chat.core.knowledge.SearchService;
import com.tencent.supersonic.headless.api.pojo.response.S2Term;
import com.tencent.supersonic.headless.core.knowledge.HanlpMapResult;
import com.tencent.supersonic.headless.core.knowledge.SearchService;
import com.tencent.supersonic.chat.core.pojo.QueryContext;
import com.tencent.supersonic.common.pojo.Constants;
import java.util.HashMap;
@@ -35,7 +35,7 @@ public class HanlpDictMatchStrategy extends BaseMatchStrategy<HanlpMapResult> {
private OptimizationConfig optimizationConfig;
@Override
public Map<MatchText, List<HanlpMapResult>> match(QueryContext queryContext, List<Term> terms,
public Map<MatchText, List<HanlpMapResult>> match(QueryContext queryContext, List<S2Term> terms,
Set<Long> detectModelIds) {
String text = queryContext.getQueryText();
if (Objects.isNull(terms) || StringUtils.isEmpty(text)) {
@@ -60,16 +60,15 @@ public class HanlpDictMatchStrategy extends BaseMatchStrategy<HanlpMapResult> {
public void detectByStep(QueryContext queryContext, Set<HanlpMapResult> existResults, Set<Long> detectModelIds,
Integer startIndex, Integer index, int offset) {
String text = queryContext.getQueryText();
Integer agentId = queryContext.getAgentId();
String detectSegment = text.substring(startIndex, index);
// step1. pre search
Integer oneDetectionMaxSize = optimizationConfig.getOneDetectionMaxSize();
LinkedHashSet<HanlpMapResult> hanlpMapResults = SearchService.prefixSearch(detectSegment, oneDetectionMaxSize,
agentId, detectModelIds).stream().collect(Collectors.toCollection(LinkedHashSet::new));
detectModelIds).stream().collect(Collectors.toCollection(LinkedHashSet::new));
// step2. suffix search
LinkedHashSet<HanlpMapResult> suffixHanlpMapResults = SearchService.suffixSearch(detectSegment,
oneDetectionMaxSize, agentId, detectModelIds).stream()
oneDetectionMaxSize, detectModelIds).stream()
.collect(Collectors.toCollection(LinkedHashSet::new));
hanlpMapResults.addAll(suffixHanlpMapResults);

View File

@@ -1,15 +1,16 @@
package com.tencent.supersonic.chat.core.mapper;
import com.hankcs.hanlp.seg.common.Term;
import com.tencent.supersonic.chat.api.pojo.SchemaElement;
import com.tencent.supersonic.headless.api.pojo.SchemaElement;
import com.tencent.supersonic.chat.api.pojo.SchemaElementMatch;
import com.tencent.supersonic.chat.api.pojo.SchemaElementType;
import com.tencent.supersonic.headless.api.pojo.SchemaElementType;
import com.tencent.supersonic.chat.api.pojo.SchemaMapInfo;
import com.tencent.supersonic.chat.core.knowledge.DatabaseMapResult;
import com.tencent.supersonic.chat.core.knowledge.HanlpMapResult;
import com.tencent.supersonic.headless.api.pojo.response.S2Term;
import com.tencent.supersonic.headless.core.knowledge.DatabaseMapResult;
import com.tencent.supersonic.headless.core.knowledge.HanlpMapResult;
import com.tencent.supersonic.chat.core.pojo.QueryContext;
import com.tencent.supersonic.chat.core.utils.HanlpHelper;
import com.tencent.supersonic.chat.core.utils.NatureHelper;
import com.tencent.supersonic.headless.server.service.KnowledgeService;
import com.tencent.supersonic.headless.core.knowledge.helper.HanlpHelper;
import com.tencent.supersonic.headless.core.knowledge.helper.NatureHelper;
import com.tencent.supersonic.common.util.ContextUtils;
import lombok.extern.slf4j.Slf4j;
import org.springframework.util.CollectionUtils;
@@ -32,7 +33,8 @@ public class KeywordMapper extends BaseMapper {
public void doMap(QueryContext queryContext) {
String queryText = queryContext.getQueryText();
//1.hanlpDict Match
List<Term> terms = HanlpHelper.getTerms(queryText);
KnowledgeService knowledgeService = ContextUtils.getBean(KnowledgeService.class);
List<S2Term> terms = knowledgeService.getTerms(queryText);
HanlpDictMatchStrategy hanlpMatchStrategy = ContextUtils.getBean(HanlpDictMatchStrategy.class);
List<HanlpMapResult> hanlpMapResults = hanlpMatchStrategy.getMatches(queryContext, terms);
@@ -46,7 +48,7 @@ public class KeywordMapper extends BaseMapper {
}
private void convertHanlpMapResultToMapInfo(List<HanlpMapResult> mapResults, QueryContext queryContext,
List<Term> terms) {
List<S2Term> terms) {
if (CollectionUtils.isEmpty(mapResults)) {
return;
}

View File

@@ -1,10 +1,10 @@
package com.tencent.supersonic.chat.core.mapper;
import com.hankcs.hanlp.algorithm.EditDistance;
import com.hankcs.hanlp.seg.common.Term;
import com.tencent.supersonic.chat.core.agent.Agent;
import com.tencent.supersonic.chat.core.config.OptimizationConfig;
import com.tencent.supersonic.chat.core.utils.NatureHelper;
import com.tencent.supersonic.headless.api.pojo.response.S2Term;
import com.tencent.supersonic.headless.core.knowledge.helper.NatureHelper;
import lombok.Data;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
@@ -36,8 +36,8 @@ public class MapperHelper {
return index;
}
public Integer getStepOffset(List<Term> termList, Integer index) {
List<Integer> offsetList = termList.stream().sorted(Comparator.comparing(Term::getOffset))
public Integer getStepOffset(List<S2Term> termList, Integer index) {
List<Integer> offsetList = termList.stream().sorted(Comparator.comparing(S2Term::getOffset))
.map(term -> term.getOffset()).collect(Collectors.toList());
for (int j = 0; j < termList.size() - 1; j++) {

View File

@@ -1,7 +1,8 @@
package com.tencent.supersonic.chat.core.mapper;
import com.hankcs.hanlp.seg.common.Term;
import com.tencent.supersonic.chat.core.pojo.QueryContext;
import com.tencent.supersonic.headless.api.pojo.response.S2Term;
import java.util.List;
import java.util.Map;
import java.util.Set;
@@ -12,6 +13,6 @@ import java.util.Set;
*/
public interface MatchStrategy<T> {
Map<MatchText, List<T>> match(QueryContext queryContext, List<Term> terms, Set<Long> detectModelId);
Map<MatchText, List<T>> match(QueryContext queryContext, List<S2Term> terms, Set<Long> detectModelId);
}

View File

@@ -1,6 +1,6 @@
package com.tencent.supersonic.chat.core.mapper;
import com.tencent.supersonic.chat.api.pojo.SchemaElementType;
import com.tencent.supersonic.headless.api.pojo.SchemaElementType;
import java.io.Serializable;
import lombok.Data;
import lombok.ToString;

View File

@@ -1,13 +1,13 @@
package com.tencent.supersonic.chat.core.mapper;
import com.google.common.collect.Lists;
import com.tencent.supersonic.chat.api.pojo.SchemaElement;
import com.tencent.supersonic.headless.api.pojo.SchemaElement;
import com.tencent.supersonic.chat.api.pojo.SchemaElementMatch;
import com.tencent.supersonic.chat.api.pojo.SchemaElementType;
import com.tencent.supersonic.headless.api.pojo.SchemaElementType;
import com.tencent.supersonic.chat.api.pojo.SchemaMapInfo;
import com.tencent.supersonic.chat.api.pojo.request.QueryFilter;
import com.tencent.supersonic.chat.api.pojo.request.QueryFilters;
import com.tencent.supersonic.chat.core.knowledge.builder.BaseWordBuilder;
import com.tencent.supersonic.headless.core.knowledge.builder.BaseWordBuilder;
import com.tencent.supersonic.chat.core.pojo.QueryContext;
import com.tencent.supersonic.common.pojo.Constants;
import lombok.extern.slf4j.Slf4j;

View File

@@ -1,9 +1,9 @@
package com.tencent.supersonic.chat.core.mapper;
import com.google.common.collect.Lists;
import com.hankcs.hanlp.seg.common.Term;
import com.tencent.supersonic.chat.core.knowledge.HanlpMapResult;
import com.tencent.supersonic.chat.core.knowledge.SearchService;
import com.tencent.supersonic.headless.api.pojo.response.S2Term;
import com.tencent.supersonic.headless.core.knowledge.HanlpMapResult;
import com.tencent.supersonic.headless.core.knowledge.SearchService;
import com.tencent.supersonic.chat.core.pojo.QueryContext;
import com.tencent.supersonic.common.pojo.enums.DictWordType;
import java.util.List;
@@ -26,7 +26,7 @@ public class SearchMatchStrategy extends BaseMatchStrategy<HanlpMapResult> {
private static final int SEARCH_SIZE = 3;
@Override
public Map<MatchText, List<HanlpMapResult>> match(QueryContext queryContext, List<Term> originals,
public Map<MatchText, List<HanlpMapResult>> match(QueryContext queryContext, List<S2Term> originals,
Set<Long> detectModelIds) {
String text = queryContext.getQueryText();
Map<Integer, Integer> regOffsetToLength = getRegOffsetToLength(originals);
@@ -52,9 +52,9 @@ public class SearchMatchStrategy extends BaseMatchStrategy<HanlpMapResult> {
if (StringUtils.isNotEmpty(detectSegment)) {
List<HanlpMapResult> hanlpMapResults = SearchService.prefixSearch(detectSegment,
SearchService.SEARCH_SIZE, queryContext.getAgentId(), detectModelIds);
SearchService.SEARCH_SIZE, detectModelIds);
List<HanlpMapResult> suffixHanlpMapResults = SearchService.suffixSearch(
detectSegment, SEARCH_SIZE, queryContext.getAgentId(), detectModelIds);
detectSegment, SEARCH_SIZE, detectModelIds);
hanlpMapResults.addAll(suffixHanlpMapResults);
// remove entity name where search
hanlpMapResults = hanlpMapResults.stream().filter(entry -> {

View File

@@ -1,7 +1,7 @@
package com.tencent.supersonic.chat.core.parser;
import com.tencent.supersonic.auth.api.authentication.pojo.User;
import com.tencent.supersonic.chat.api.pojo.SchemaElement;
import com.tencent.supersonic.headless.api.pojo.SchemaElement;
import com.tencent.supersonic.chat.api.pojo.SemanticParseInfo;
import com.tencent.supersonic.chat.api.pojo.SemanticSchema;
import com.tencent.supersonic.chat.api.pojo.response.SqlInfo;

View File

@@ -3,7 +3,7 @@ package com.tencent.supersonic.chat.core.parser.plugin;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.tencent.supersonic.chat.api.pojo.SchemaElementMatch;
import com.tencent.supersonic.chat.api.pojo.SchemaElementType;
import com.tencent.supersonic.headless.api.pojo.SchemaElementType;
import com.tencent.supersonic.chat.api.pojo.SemanticParseInfo;
import com.tencent.supersonic.chat.api.pojo.request.QueryFilter;
import com.tencent.supersonic.chat.api.pojo.request.QueryFilters;

View File

@@ -1,7 +1,7 @@
package com.tencent.supersonic.chat.core.parser.sql.llm;
import com.tencent.supersonic.chat.api.pojo.SchemaElementMatch;
import com.tencent.supersonic.chat.api.pojo.SchemaElementType;
import com.tencent.supersonic.headless.api.pojo.SchemaElementType;
import com.tencent.supersonic.chat.api.pojo.SchemaMapInfo;
import com.tencent.supersonic.chat.core.pojo.QueryContext;
import com.tencent.supersonic.chat.core.query.SemanticQuery;

View File

@@ -1,16 +1,16 @@
package com.tencent.supersonic.chat.core.parser.sql.llm;
import com.google.common.collect.Lists;
import com.tencent.supersonic.chat.api.pojo.SchemaElement;
import com.tencent.supersonic.headless.api.pojo.SchemaElement;
import com.tencent.supersonic.chat.api.pojo.SchemaElementMatch;
import com.tencent.supersonic.chat.api.pojo.SchemaElementType;
import com.tencent.supersonic.headless.api.pojo.SchemaElementType;
import com.tencent.supersonic.chat.api.pojo.SemanticSchema;
import com.tencent.supersonic.chat.core.agent.Agent;
import com.tencent.supersonic.chat.core.agent.AgentToolType;
import com.tencent.supersonic.chat.core.agent.NL2SQLTool;
import com.tencent.supersonic.chat.core.config.LLMParserConfig;
import com.tencent.supersonic.chat.core.config.OptimizationConfig;
import com.tencent.supersonic.chat.core.knowledge.semantic.SemanticInterpreter;
import com.tencent.supersonic.chat.core.query.semantic.SemanticInterpreter;
import com.tencent.supersonic.chat.core.parser.SatisfactionChecker;
import com.tencent.supersonic.chat.core.pojo.QueryContext;
import com.tencent.supersonic.chat.core.query.llm.s2sql.LLMReq;

View File

@@ -1,7 +1,7 @@
package com.tencent.supersonic.chat.core.parser.sql.rule;
import com.tencent.supersonic.chat.api.pojo.SchemaElementMatch;
import com.tencent.supersonic.chat.api.pojo.SchemaElementType;
import com.tencent.supersonic.headless.api.pojo.SchemaElementType;
import com.tencent.supersonic.chat.core.parser.SemanticParser;
import com.tencent.supersonic.chat.core.pojo.ChatContext;
import com.tencent.supersonic.chat.core.pojo.QueryContext;

View File

@@ -3,9 +3,9 @@ package com.tencent.supersonic.chat.core.plugin;
import com.alibaba.fastjson.JSONObject;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.tencent.supersonic.chat.api.pojo.SchemaElement;
import com.tencent.supersonic.headless.api.pojo.SchemaElement;
import com.tencent.supersonic.chat.api.pojo.SchemaElementMatch;
import com.tencent.supersonic.chat.api.pojo.SchemaElementType;
import com.tencent.supersonic.headless.api.pojo.SchemaElementType;
import com.tencent.supersonic.chat.api.pojo.SchemaMapInfo;
import com.tencent.supersonic.chat.core.agent.Agent;
import com.tencent.supersonic.chat.core.agent.AgentToolType;

View File

@@ -6,7 +6,7 @@ import com.tencent.supersonic.chat.api.pojo.SemanticParseInfo;
import com.tencent.supersonic.chat.api.pojo.SemanticSchema;
import com.tencent.supersonic.chat.api.pojo.response.SqlInfo;
import com.tencent.supersonic.chat.core.config.OptimizationConfig;
import com.tencent.supersonic.chat.core.knowledge.semantic.SemanticInterpreter;
import com.tencent.supersonic.chat.core.query.semantic.SemanticInterpreter;
import com.tencent.supersonic.chat.core.utils.ComponentFactory;
import com.tencent.supersonic.chat.core.utils.QueryReqBuilder;
import com.tencent.supersonic.common.pojo.Aggregator;

View File

@@ -2,14 +2,14 @@ package com.tencent.supersonic.chat.core.query.llm.analytics;
import com.google.common.collect.Lists;
import com.tencent.supersonic.auth.api.authentication.pojo.User;
import com.tencent.supersonic.chat.api.pojo.SchemaElement;
import com.tencent.supersonic.headless.api.pojo.SchemaElement;
import com.tencent.supersonic.chat.api.pojo.SchemaElementMatch;
import com.tencent.supersonic.chat.api.pojo.SchemaElementType;
import com.tencent.supersonic.headless.api.pojo.SchemaElementType;
import com.tencent.supersonic.chat.api.pojo.SemanticSchema;
import com.tencent.supersonic.chat.api.pojo.response.QueryResult;
import com.tencent.supersonic.chat.api.pojo.response.QueryState;
import com.tencent.supersonic.chat.core.config.OptimizationConfig;
import com.tencent.supersonic.chat.core.knowledge.semantic.SemanticInterpreter;
import com.tencent.supersonic.chat.core.query.semantic.SemanticInterpreter;
import com.tencent.supersonic.chat.core.query.QueryManager;
import com.tencent.supersonic.chat.core.query.llm.LLMSemanticQuery;
import com.tencent.supersonic.chat.core.utils.ComponentFactory;

View File

@@ -2,7 +2,7 @@ package com.tencent.supersonic.chat.core.query.llm.s2sql;
import com.tencent.supersonic.auth.api.authentication.pojo.User;
import com.tencent.supersonic.chat.api.pojo.SemanticSchema;
import com.tencent.supersonic.chat.core.knowledge.semantic.SemanticInterpreter;
import com.tencent.supersonic.chat.core.query.semantic.SemanticInterpreter;
import com.tencent.supersonic.chat.api.pojo.response.QueryResult;
import com.tencent.supersonic.chat.api.pojo.response.QueryState;
import com.tencent.supersonic.chat.api.pojo.response.SqlInfo;

View File

@@ -3,7 +3,7 @@ package com.tencent.supersonic.chat.core.query.plugin;
import com.google.common.collect.Lists;
import com.tencent.supersonic.auth.api.authentication.pojo.User;
import com.tencent.supersonic.chat.api.pojo.SchemaElementMatch;
import com.tencent.supersonic.chat.api.pojo.SchemaElementType;
import com.tencent.supersonic.headless.api.pojo.SchemaElementType;
import com.tencent.supersonic.chat.api.pojo.SemanticSchema;
import com.tencent.supersonic.chat.api.pojo.request.QueryFilter;
import com.tencent.supersonic.chat.api.pojo.request.QueryFilters;

View File

@@ -1,7 +1,7 @@
package com.tencent.supersonic.chat.core.query.rule;
import com.tencent.supersonic.chat.api.pojo.SchemaElementMatch;
import com.tencent.supersonic.chat.api.pojo.SchemaElementType;
import com.tencent.supersonic.headless.api.pojo.SchemaElementType;
import com.tencent.supersonic.common.pojo.enums.AggregateTypeEnum;
import lombok.Data;
import lombok.ToString;

View File

@@ -2,16 +2,16 @@
package com.tencent.supersonic.chat.core.query.rule;
import com.tencent.supersonic.auth.api.authentication.pojo.User;
import com.tencent.supersonic.chat.api.pojo.SchemaElement;
import com.tencent.supersonic.headless.api.pojo.SchemaElement;
import com.tencent.supersonic.chat.api.pojo.SchemaElementMatch;
import com.tencent.supersonic.chat.api.pojo.SchemaElementType;
import com.tencent.supersonic.headless.api.pojo.SchemaElementType;
import com.tencent.supersonic.chat.api.pojo.SemanticParseInfo;
import com.tencent.supersonic.chat.api.pojo.SemanticSchema;
import com.tencent.supersonic.chat.api.pojo.request.QueryFilter;
import com.tencent.supersonic.chat.api.pojo.response.QueryResult;
import com.tencent.supersonic.chat.api.pojo.response.QueryState;
import com.tencent.supersonic.chat.core.config.OptimizationConfig;
import com.tencent.supersonic.chat.core.knowledge.semantic.SemanticInterpreter;
import com.tencent.supersonic.chat.core.query.semantic.SemanticInterpreter;
import com.tencent.supersonic.chat.core.pojo.ChatContext;
import com.tencent.supersonic.chat.core.pojo.QueryContext;
import com.tencent.supersonic.chat.core.query.BaseSemanticQuery;

View File

@@ -1,6 +1,6 @@
package com.tencent.supersonic.chat.core.query.rule.metric;
import static com.tencent.supersonic.chat.api.pojo.SchemaElementType.VALUE;
import static com.tencent.supersonic.headless.api.pojo.SchemaElementType.VALUE;
import static com.tencent.supersonic.chat.core.query.rule.QueryMatchOption.OptionType.REQUIRED;
import static com.tencent.supersonic.chat.core.query.rule.QueryMatchOption.RequireNumberType.AT_LEAST;

View File

@@ -2,8 +2,8 @@ package com.tencent.supersonic.chat.core.query.rule.metric;
import org.springframework.stereotype.Component;
import static com.tencent.supersonic.chat.api.pojo.SchemaElementType.DIMENSION;
import static com.tencent.supersonic.chat.api.pojo.SchemaElementType.VALUE;
import static com.tencent.supersonic.headless.api.pojo.SchemaElementType.DIMENSION;
import static com.tencent.supersonic.headless.api.pojo.SchemaElementType.VALUE;
import static com.tencent.supersonic.chat.core.query.rule.QueryMatchOption.OptionType.OPTIONAL;
import static com.tencent.supersonic.chat.core.query.rule.QueryMatchOption.RequireNumberType.AT_LEAST;
import static com.tencent.supersonic.chat.core.query.rule.QueryMatchOption.OptionType.REQUIRED;

View File

@@ -4,7 +4,7 @@ import static com.tencent.supersonic.chat.core.query.rule.QueryMatchOption.Optio
import static com.tencent.supersonic.chat.core.query.rule.QueryMatchOption.RequireNumberType.AT_MOST;
import com.tencent.supersonic.auth.api.authentication.pojo.User;
import com.tencent.supersonic.chat.api.pojo.SchemaElementType;
import com.tencent.supersonic.headless.api.pojo.SchemaElementType;
import com.tencent.supersonic.chat.api.pojo.response.QueryResult;
import org.springframework.stereotype.Component;
@Component

View File

@@ -1,8 +1,8 @@
package com.tencent.supersonic.chat.core.query.rule.metric;
import static com.tencent.supersonic.chat.api.pojo.SchemaElementType.METRIC;
import static com.tencent.supersonic.chat.core.query.rule.QueryMatchOption.OptionType.REQUIRED;
import static com.tencent.supersonic.chat.core.query.rule.QueryMatchOption.RequireNumberType.AT_LEAST;
import static com.tencent.supersonic.headless.api.pojo.SchemaElementType.METRIC;
import com.tencent.supersonic.chat.api.pojo.SchemaElementMatch;
import com.tencent.supersonic.chat.api.pojo.ViewSchema;
@@ -29,7 +29,7 @@ public abstract class MetricSemanticQuery extends RuleSemanticQuery {
@Override
public List<SchemaElementMatch> match(List<SchemaElementMatch> candidateElementMatches,
QueryContext queryCtx) {
QueryContext queryCtx) {
return super.match(candidateElementMatches, queryCtx);
}

View File

@@ -1,7 +1,7 @@
package com.tencent.supersonic.chat.core.query.rule.metric;
import static com.tencent.supersonic.chat.api.pojo.SchemaElementType.ENTITY;
import static com.tencent.supersonic.chat.api.pojo.SchemaElementType.ID;
import static com.tencent.supersonic.headless.api.pojo.SchemaElementType.ENTITY;
import static com.tencent.supersonic.headless.api.pojo.SchemaElementType.ID;
import static com.tencent.supersonic.chat.core.query.rule.QueryMatchOption.OptionType.REQUIRED;
import static com.tencent.supersonic.chat.core.query.rule.QueryMatchOption.RequireNumberType.AT_LEAST;

View File

@@ -1,13 +1,13 @@
package com.tencent.supersonic.chat.core.query.rule.metric;
import static com.tencent.supersonic.chat.api.pojo.SchemaElementType.DIMENSION;
import static com.tencent.supersonic.chat.api.pojo.SchemaElementType.VALUE;
import static com.tencent.supersonic.headless.api.pojo.SchemaElementType.DIMENSION;
import static com.tencent.supersonic.headless.api.pojo.SchemaElementType.VALUE;
import static com.tencent.supersonic.chat.core.query.rule.QueryMatchOption.OptionType.OPTIONAL;
import static com.tencent.supersonic.chat.core.query.rule.QueryMatchOption.OptionType.REQUIRED;
import static com.tencent.supersonic.chat.core.query.rule.QueryMatchOption.RequireNumberType.AT_LEAST;
import static com.tencent.supersonic.common.pojo.Constants.DESC_UPPER;
import com.tencent.supersonic.chat.api.pojo.SchemaElement;
import com.tencent.supersonic.headless.api.pojo.SchemaElement;
import com.tencent.supersonic.chat.api.pojo.SchemaElementMatch;
import com.tencent.supersonic.chat.core.pojo.ChatContext;
import com.tencent.supersonic.chat.core.pojo.QueryContext;

View File

@@ -1,7 +1,7 @@
package com.tencent.supersonic.chat.core.query.rule.tag;
import static com.tencent.supersonic.chat.api.pojo.SchemaElementType.DIMENSION;
import static com.tencent.supersonic.chat.api.pojo.SchemaElementType.ID;
import static com.tencent.supersonic.headless.api.pojo.SchemaElementType.DIMENSION;
import static com.tencent.supersonic.headless.api.pojo.SchemaElementType.ID;
import static com.tencent.supersonic.chat.core.query.rule.QueryMatchOption.RequireNumberType.AT_LEAST;
import static com.tencent.supersonic.chat.core.query.rule.QueryMatchOption.OptionType.REQUIRED;

View File

@@ -1,6 +1,6 @@
package com.tencent.supersonic.chat.core.query.rule.tag;
import static com.tencent.supersonic.chat.api.pojo.SchemaElementType.VALUE;
import static com.tencent.supersonic.headless.api.pojo.SchemaElementType.VALUE;
import static com.tencent.supersonic.chat.core.query.rule.QueryMatchOption.OptionType.REQUIRED;
import static com.tencent.supersonic.chat.core.query.rule.QueryMatchOption.RequireNumberType.AT_LEAST;

View File

@@ -1,6 +1,6 @@
package com.tencent.supersonic.chat.core.query.rule.tag;
import static com.tencent.supersonic.chat.api.pojo.SchemaElementType.ID;
import static com.tencent.supersonic.headless.api.pojo.SchemaElementType.ID;
import static com.tencent.supersonic.chat.core.query.rule.QueryMatchOption.OptionType.REQUIRED;
import static com.tencent.supersonic.chat.core.query.rule.QueryMatchOption.RequireNumberType.AT_LEAST;

View File

@@ -1,7 +1,7 @@
package com.tencent.supersonic.chat.core.query.rule.tag;
import com.tencent.supersonic.chat.api.pojo.SchemaElement;
import com.tencent.supersonic.chat.api.pojo.SchemaElementType;
import com.tencent.supersonic.headless.api.pojo.SchemaElement;
import com.tencent.supersonic.headless.api.pojo.SchemaElementType;
import com.tencent.supersonic.chat.api.pojo.SemanticParseInfo;
import com.tencent.supersonic.chat.api.pojo.ViewSchema;
import com.tencent.supersonic.chat.core.pojo.ChatContext;

View File

@@ -15,7 +15,7 @@ import java.time.LocalDate;
import java.util.List;
import java.util.Objects;
import static com.tencent.supersonic.chat.api.pojo.SchemaElementType.ENTITY;
import static com.tencent.supersonic.headless.api.pojo.SchemaElementType.ENTITY;
import static com.tencent.supersonic.chat.core.query.rule.QueryMatchOption.OptionType.REQUIRED;
import static com.tencent.supersonic.chat.core.query.rule.QueryMatchOption.RequireNumberType.AT_LEAST;

View File

@@ -1,4 +1,4 @@
package com.tencent.supersonic.chat.core.knowledge.semantic;
package com.tencent.supersonic.chat.core.query.semantic;
import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;

View File

@@ -1,4 +1,4 @@
package com.tencent.supersonic.chat.core.knowledge.semantic;
package com.tencent.supersonic.chat.core.query.semantic;
import com.github.pagehelper.PageInfo;
import com.tencent.supersonic.auth.api.authentication.pojo.User;

View File

@@ -1,4 +1,4 @@
package com.tencent.supersonic.chat.core.knowledge.semantic;
package com.tencent.supersonic.chat.core.query.semantic;
import com.github.pagehelper.PageInfo;
import com.google.gson.Gson;

View File

@@ -1,4 +1,4 @@
package com.tencent.supersonic.chat.core.knowledge.semantic;
package com.tencent.supersonic.chat.core.query.semantic;
import com.github.pagehelper.PageInfo;
import com.tencent.supersonic.auth.api.authentication.pojo.User;

View File

@@ -1,10 +1,10 @@
package com.tencent.supersonic.chat.core.knowledge.semantic;
package com.tencent.supersonic.chat.core.query.semantic;
import com.google.common.collect.Lists;
import com.tencent.supersonic.chat.api.pojo.RelatedSchemaElement;
import com.tencent.supersonic.chat.api.pojo.SchemaElement;
import com.tencent.supersonic.chat.api.pojo.SchemaElementType;
import com.tencent.supersonic.chat.api.pojo.SchemaValueMap;
import com.tencent.supersonic.headless.api.pojo.RelatedSchemaElement;
import com.tencent.supersonic.headless.api.pojo.SchemaElement;
import com.tencent.supersonic.headless.api.pojo.SchemaElementType;
import com.tencent.supersonic.headless.api.pojo.SchemaValueMap;
import com.tencent.supersonic.chat.api.pojo.ViewSchema;
import com.tencent.supersonic.headless.api.pojo.DimValueMap;
import com.tencent.supersonic.headless.api.pojo.RelateDimension;

View File

@@ -1,6 +1,6 @@
package com.tencent.supersonic.chat.core.utils;
import com.tencent.supersonic.chat.core.knowledge.semantic.SemanticInterpreter;
import com.tencent.supersonic.chat.core.query.semantic.SemanticInterpreter;
import com.tencent.supersonic.chat.core.parser.JavaLLMProxy;
import com.tencent.supersonic.chat.core.parser.LLMProxy;
import com.tencent.supersonic.chat.core.parser.sql.llm.ViewResolver;

View File

@@ -1,219 +0,0 @@
package com.tencent.supersonic.chat.core.utils;
import com.tencent.supersonic.auth.api.authentication.pojo.User;
import com.tencent.supersonic.chat.core.config.DefaultMetric;
import com.tencent.supersonic.chat.core.config.Dim4Dict;
import com.tencent.supersonic.chat.core.knowledge.semantic.SemanticInterpreter;
import com.tencent.supersonic.common.pojo.Aggregator;
import com.tencent.supersonic.common.pojo.Constants;
import com.tencent.supersonic.common.pojo.DateConf;
import com.tencent.supersonic.common.pojo.Filter;
import com.tencent.supersonic.common.pojo.Order;
import com.tencent.supersonic.common.pojo.QueryColumn;
import com.tencent.supersonic.common.pojo.enums.AggOperatorEnum;
import com.tencent.supersonic.common.pojo.enums.FilterOperatorEnum;
import com.tencent.supersonic.headless.api.pojo.request.QueryStructReq;
import com.tencent.supersonic.headless.api.pojo.response.SemanticQueryResp;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.util.Strings;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;
import org.springframework.util.CollectionUtils;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.StringJoiner;
import static com.tencent.supersonic.common.pojo.Constants.AND_UPPER;
import static com.tencent.supersonic.common.pojo.Constants.APOSTROPHE;
import static com.tencent.supersonic.common.pojo.Constants.COMMA;
import static com.tencent.supersonic.common.pojo.Constants.SPACE;
import static com.tencent.supersonic.common.pojo.Constants.UNDERLINE_DOUBLE;
@Slf4j
@Component
public class DictQueryHelper {
private static final Long MAX_FREQUENCY = 99999999L;
private SemanticInterpreter semanticInterpreter = ComponentFactory.getSemanticLayer();
@Value("${dimension.multi.value.split:#}")
private String dimMultiValueSplit;
@Value("${dimension.value.show:50}")
private Integer printDataShow;
@Value("${dimension.max.limit:3000000}")
private Long dimMaxLimit;
@Value("${dimension.white.weight:60000000}")
private Long dimensionWhiteWeight;
public List<String> fetchDimValueSingle(Long modelId, DefaultMetric defaultMetricDesc, Dim4Dict dim4Dict,
User user) {
List<String> data = new ArrayList<>();
QueryStructReq queryStructCmd = generateQueryStructCmd(modelId, defaultMetricDesc, dim4Dict);
try {
SemanticQueryResp semanticQueryResp = semanticInterpreter.queryByStruct(queryStructCmd, user);
log.info("fetchDimValueSingle sql:{}", semanticQueryResp.getSql());
String nature = String.format("_%d_%d", modelId, dim4Dict.getDimId());
String dimNameRewrite = rewriteDimName(semanticQueryResp.getColumns(), dim4Dict.getBizName());
data = generateFileData(semanticQueryResp.getResultList(), nature, dimNameRewrite,
defaultMetricDesc.getBizName(), dim4Dict);
if (!CollectionUtils.isEmpty(data)) {
int size = Math.min(data.size(), printDataShow);
log.info("data:{}", data.subList(0, size));
} else {
log.warn("data is empty. nature:{}", nature);
if (Objects.nonNull(semanticQueryResp)) {
log.warn("sql:{}", semanticQueryResp.getSql());
}
}
} catch (Exception e) {
log.warn("fetchDimValueSingle,e:", e);
}
return data;
}
private String rewriteDimName(List<QueryColumn> columns, String bizName) {
// when the metric parser joins models, the dimension column may come back as "<prefix>__<bizName>"; prefer that rewritten name
String dimNameRewrite = bizName;
if (!CollectionUtils.isEmpty(columns)) {
for (QueryColumn column : columns) {
if (StringUtils.isNotEmpty(column.getNameEn())) {
String nameEn = column.getNameEn();
if (nameEn.endsWith(UNDERLINE_DOUBLE + bizName)) {
dimNameRewrite = nameEn;
}
}
}
}
return dimNameRewrite;
}
private List<String> generateFileData(List<Map<String, Object>> resultList, String nature, String dimName,
String metricName, Dim4Dict dim4Dict) {
List<String> data = new ArrayList<>();
if (CollectionUtils.isEmpty(resultList)) {
return data;
}
Map<String, Long> valueAndFrequencyPair = new HashMap<>(2000);
for (Map<String, Object> line : resultList) {
if (CollectionUtils.isEmpty(line) || !line.containsKey(dimName)
|| !line.containsKey(metricName)
|| line.get(dimName) == null) {
continue;
}
String dimValue = line.get(dimName).toString();
Object metricObject = line.get(metricName);
if (Strings.isNotEmpty(dimValue) && Objects.nonNull(metricObject)) {
Long metric = Math.round(Double.parseDouble(metricObject.toString()));
mergeMultivaluedValue(valueAndFrequencyPair, dimValue, metric);
}
}
constructDataLines(valueAndFrequencyPair, nature, data, dim4Dict);
return data;
}
private void constructDataLines(Map<String, Long> valueAndFrequencyPair, String nature,
List<String> data, Dim4Dict dim4Dict) {
valueAndFrequencyPair.forEach((dimValue, metric) -> {
if (metric > MAX_FREQUENCY) {
metric = MAX_FREQUENCY;
}
if (Strings.isNotEmpty(dimValue) && dimValue.contains(SPACE)) {
dimValue = dimValue.replace(SPACE, "#");
}
data.add(String.format("%s %s %s", dimValue, nature, metric));
});
if (Objects.nonNull(dim4Dict) && !CollectionUtils.isEmpty(dim4Dict.getWhiteList())) {
// whitelist values get a fixed, heavily boosted frequency
dim4Dict.getWhiteList()
.forEach(white -> data.add(String.format("%s %s %s", white, nature, dimensionWhiteWeight)));
}
}
private void mergeMultivaluedValue(Map<String, Long> valueAndFrequencyPair, String dimValue, Long metric) {
if (Strings.isEmpty(dimValue)) {
return;
}
Map<String, Long> tmp = new HashMap<>();
if (dimValue.contains(dimMultiValueSplit)) {
Arrays.stream(dimValue.split(dimMultiValueSplit))
.forEach(dimValueSingle -> tmp.put(dimValueSingle, metric));
} else {
tmp.put(dimValue, metric);
}
// accumulate frequencies for values that appear more than once
tmp.forEach((value, frequency) -> valueAndFrequencyPair.merge(value, frequency, Long::sum));
}
private QueryStructReq generateQueryStructCmd(Long modelId, DefaultMetric defaultMetricDesc, Dim4Dict dim4Dict) {
QueryStructReq queryStructCmd = new QueryStructReq();
List<Filter> filters = generateFilters(dim4Dict, queryStructCmd);
queryStructCmd.setDimensionFilters(filters);
List<Aggregator> aggregators = new ArrayList<>();
aggregators.add(new Aggregator(defaultMetricDesc.getBizName(), AggOperatorEnum.SUM));
queryStructCmd.setAggregators(aggregators);
List<Order> orders = new ArrayList<>();
orders.add(new Order(defaultMetricDesc.getBizName(), Constants.DESC_UPPER));
queryStructCmd.setOrders(orders);
DateConf dateInfo = new DateConf();
dateInfo.setDateMode(DateConf.DateMode.RECENT);
log.debug("defaultMetric unit():{}", defaultMetricDesc.getUnit());
dateInfo.setUnit(defaultMetricDesc.getUnit());
dateInfo.setPeriod(defaultMetricDesc.getPeriod());
queryStructCmd.setDateInfo(dateInfo);
queryStructCmd.setLimit(dimMaxLimit);
queryStructCmd.setNeedAuth(false);
return queryStructCmd;
}
private List<Filter> generateFilters(Dim4Dict dim4Dict, QueryStructReq queryStructCmd) {
String whereStr = generateFilter(dim4Dict);
if (Strings.isEmpty(whereStr)) {
return new ArrayList<>();
}
Filter filter = new Filter("", FilterOperatorEnum.SQL_PART, whereStr);
List<Filter> filters = Objects.isNull(queryStructCmd.getOriginalFilter()) ? new ArrayList<>()
: queryStructCmd.getOriginalFilter();
filters.add(filter);
return filters;
}
private String generateFilter(Dim4Dict dim4Dict) {
if (Objects.isNull(dim4Dict)) {
return "";
}
StringJoiner joiner = new StringJoiner(SPACE + AND_UPPER + SPACE);
String dimName = dim4Dict.getBizName();
if (!CollectionUtils.isEmpty(dim4Dict.getBlackList())) {
StringJoiner joinerBlack = new StringJoiner(COMMA);
dim4Dict.getBlackList().forEach(black -> joinerBlack.add(APOSTROPHE + black + APOSTROPHE));
joiner.add(String.format("(%s not in (%s))", dimName, joinerBlack.toString()));
}
if (!CollectionUtils.isEmpty(dim4Dict.getRuleList())) {
dim4Dict.getRuleList().forEach(joiner::add);
}
return joiner.toString();
}
}
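A hypothetical illustration, not part of the diff: generateFileData and constructDataLines emit one custom-dictionary line per value in the form "<value> <nature> <frequency>", where the nature is "_<modelId>_<dimId>" and spaces inside values are replaced by "#". A minimal standalone sketch of that line format; every name below is invented for the example:

public class DictLineFormatSketch {

    // mirrors the SPACE -> "#" substitution and the "%s %s %s" layout in constructDataLines
    static String toDictLine(String dimValue, long modelId, long dimId, long frequency) {
        String safeValue = dimValue.replace(" ", "#");
        String nature = String.format("_%d_%d", modelId, dimId);
        return String.format("%s %s %d", safeValue, nature, frequency);
    }

    public static void main(String[] args) {
        // a "city" dimension (id 3) of model 1 whose value "new york" scored 42
        System.out.println(toDictLine("new york", 1, 3, 42)); // prints: new#york _1_3 42
    }
}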

View File

@@ -1,82 +0,0 @@
package com.tencent.supersonic.chat.core.utils;
import static com.hankcs.hanlp.HanLP.Config.CustomDictionaryPath;
import com.hankcs.hanlp.dictionary.DynamicCustomDictionary;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import lombok.extern.slf4j.Slf4j;
@Slf4j
public class FileHelper {
public static final String FILE_SPILT = File.separator;
public static void deleteCacheFile(String[] path) throws IOException {
String customPath = getCustomPath(path);
File customFolder = new File(customPath);
File[] customSubFiles = getFileList(customFolder, ".bin");
for (File file : customSubFiles) {
try {
file.delete();
log.info("customPath:{},delete file:{}", customPath, file);
} catch (Exception e) {
log.error("delete " + file, e);
}
}
}
private static File[] getFileList(File customFolder, String suffix) {
// keep regular files whose names end with the given suffix (e.g. ".bin", ".txt")
return customFolder.listFiles(file -> !file.isDirectory() && file.getName().toLowerCase().endsWith(suffix));
}
private static String getCustomPath(String[] path) {
return path[0].substring(0, path[0].lastIndexOf(FILE_SPILT)) + FILE_SPILT;
}
/**
* re-point CustomDictionaryPath at the .txt dictionary files currently on disk
*
* @param customDictionary dictionary whose path should be refreshed
*/
public static void resetCustomPath(DynamicCustomDictionary customDictionary) {
String[] path = CustomDictionaryPath;
String customPath = getCustomPath(path);
File customFolder = new File(customPath);
File[] customSubFiles = getFileList(customFolder, ".txt");
List<String> fileList = new ArrayList<>();
for (File file : customSubFiles) {
if (file.isFile()) {
fileList.add(file.getAbsolutePath());
}
}
log.debug("CustomDictionaryPath:{}", fileList);
CustomDictionaryPath = fileList.toArray(new String[0]);
// fall back to the original path when no .txt dictionaries were found
customDictionary.path = (CustomDictionaryPath == null || CustomDictionaryPath.length == 0) ? path
: CustomDictionaryPath;
if (CustomDictionaryPath == null || CustomDictionaryPath.length == 0) {
CustomDictionaryPath = path;
}
}
}
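A hedged usage sketch, not from the diff: HanLP persists compiled dictionaries as .bin files beside the .txt sources, so a typical rebuild (as HanlpHelper.reloadCustomDictionary does further down for the local case) drops the caches, rescans the .txt files, and reloads. Assumes CustomDictionaryPath is already populated from hanlp.properties and that DynamicCustomDictionary exposes a varargs path constructor:

import static com.hankcs.hanlp.HanLP.Config.CustomDictionaryPath;

import com.hankcs.hanlp.dictionary.DynamicCustomDictionary;

public class FileHelperUsageSketch {
    public static void main(String[] args) throws Exception {
        DynamicCustomDictionary dictionary = new DynamicCustomDictionary(CustomDictionaryPath);
        FileHelper.deleteCacheFile(CustomDictionaryPath); // drop stale .bin caches
        FileHelper.resetCustomPath(dictionary);           // rescan the .txt files on disk
        dictionary.reload();                              // re-parse from the refreshed paths
    }
}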

View File

@@ -1,215 +0,0 @@
package com.tencent.supersonic.chat.core.utils;
import static com.hankcs.hanlp.HanLP.Config.CustomDictionaryPath;
import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.dictionary.CoreDictionary;
import com.hankcs.hanlp.dictionary.DynamicCustomDictionary;
import com.hankcs.hanlp.seg.Segment;
import com.hankcs.hanlp.seg.common.Term;
import com.tencent.supersonic.chat.core.knowledge.DictWord;
import com.tencent.supersonic.chat.core.knowledge.HadoopFileIOAdapter;
import com.tencent.supersonic.chat.core.knowledge.MapResult;
import com.tencent.supersonic.chat.core.knowledge.MultiCustomDictionary;
import com.tencent.supersonic.chat.core.knowledge.SearchService;
import com.tencent.supersonic.common.pojo.enums.DictWordType;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.springframework.util.CollectionUtils;
import org.springframework.util.ResourceUtils;
/**
* HanLP helper
*/
@Slf4j
public class HanlpHelper {
public static final String FILE_SPILT = File.separator;
public static final String SPACE_SPILT = "#";
private static volatile DynamicCustomDictionary CustomDictionary;
private static volatile Segment segment;
static {
// reset hanlp config
try {
resetHanlpConfig();
} catch (FileNotFoundException e) {
log.error("resetHanlpConfig error", e);
}
}
public static Segment getSegment() {
if (segment == null) {
synchronized (HanlpHelper.class) {
if (segment == null) {
segment = HanLP.newSegment()
.enableIndexMode(true).enableIndexMode(4)
.enableCustomDictionary(true).enableCustomDictionaryForcing(true).enableOffset(true)
.enableNameRecognize(false).enableJapaneseNameRecognize(false)
.enableAllNamedEntityRecognize(false)
.enableNumberQuantifierRecognize(false)
.enablePlaceRecognize(false)
.enableOrganizationRecognize(false).enableCustomDictionary(getDynamicCustomDictionary());
}
}
}
return segment;
}
public static DynamicCustomDictionary getDynamicCustomDictionary() {
if (CustomDictionary == null) {
synchronized (HanlpHelper.class) {
if (CustomDictionary == null) {
CustomDictionary = new MultiCustomDictionary(CustomDictionaryPath);
}
}
}
return CustomDictionary;
}
/**
* reload custom dictionary
*/
public static boolean reloadCustomDictionary() throws IOException {
log.info("reloadCustomDictionary start");
final long startTime = System.currentTimeMillis();
if (CustomDictionaryPath == null || CustomDictionaryPath.length == 0) {
return false;
}
if (HanLP.Config.IOAdapter instanceof HadoopFileIOAdapter) {
// 1.delete hdfs file
HdfsFileHelper.deleteCacheFile(CustomDictionaryPath);
// 2.query txt files and update CustomDictionaryPath
HdfsFileHelper.resetCustomPath(getDynamicCustomDictionary());
} else {
FileHelper.deleteCacheFile(CustomDictionaryPath);
FileHelper.resetCustomPath(getDynamicCustomDictionary());
}
// 3.clear trie
SearchService.clear();
boolean reload = getDynamicCustomDictionary().reload();
log.info("reloadCustomDictionary end ,cost:{},reload:{}", System.currentTimeMillis() - startTime, reload);
return reload;
}
private static void resetHanlpConfig() throws FileNotFoundException {
if (HanLP.Config.IOAdapter instanceof HadoopFileIOAdapter) {
return;
}
String hanlpPropertiesPath = getHanlpPropertiesPath();
CustomDictionaryPath = Arrays.stream(CustomDictionaryPath).map(path -> hanlpPropertiesPath + FILE_SPILT + path)
.toArray(String[]::new);
log.info("hanlpPropertiesPath:{},CustomDictionaryPath:{}", hanlpPropertiesPath, CustomDictionaryPath);
HanLP.Config.CoreDictionaryPath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.CoreDictionaryPath;
HanLP.Config.CoreDictionaryTransformMatrixDictionaryPath = hanlpPropertiesPath + FILE_SPILT
+ HanLP.Config.CoreDictionaryTransformMatrixDictionaryPath;
HanLP.Config.BiGramDictionaryPath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.BiGramDictionaryPath;
HanLP.Config.CoreStopWordDictionaryPath =
hanlpPropertiesPath + FILE_SPILT + HanLP.Config.CoreStopWordDictionaryPath;
HanLP.Config.CoreSynonymDictionaryDictionaryPath = hanlpPropertiesPath + FILE_SPILT
+ HanLP.Config.CoreSynonymDictionaryDictionaryPath;
HanLP.Config.PersonDictionaryPath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.PersonDictionaryPath;
HanLP.Config.PersonDictionaryTrPath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.PersonDictionaryTrPath;
HanLP.Config.PinyinDictionaryPath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.PinyinDictionaryPath;
HanLP.Config.TranslatedPersonDictionaryPath = hanlpPropertiesPath + FILE_SPILT
+ HanLP.Config.TranslatedPersonDictionaryPath;
HanLP.Config.JapanesePersonDictionaryPath = hanlpPropertiesPath + FILE_SPILT
+ HanLP.Config.JapanesePersonDictionaryPath;
HanLP.Config.PlaceDictionaryPath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.PlaceDictionaryPath;
HanLP.Config.PlaceDictionaryTrPath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.PlaceDictionaryTrPath;
HanLP.Config.OrganizationDictionaryPath = hanlpPropertiesPath + FILE_SPILT
+ HanLP.Config.OrganizationDictionaryPath;
HanLP.Config.OrganizationDictionaryTrPath = hanlpPropertiesPath + FILE_SPILT
+ HanLP.Config.OrganizationDictionaryTrPath;
HanLP.Config.CharTypePath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.CharTypePath;
HanLP.Config.CharTablePath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.CharTablePath;
HanLP.Config.PartOfSpeechTagDictionary =
hanlpPropertiesPath + FILE_SPILT + HanLP.Config.PartOfSpeechTagDictionary;
HanLP.Config.WordNatureModelPath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.WordNatureModelPath;
HanLP.Config.MaxEntModelPath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.MaxEntModelPath;
HanLP.Config.NNParserModelPath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.NNParserModelPath;
HanLP.Config.PerceptronParserModelPath =
hanlpPropertiesPath + FILE_SPILT + HanLP.Config.PerceptronParserModelPath;
HanLP.Config.CRFSegmentModelPath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.CRFSegmentModelPath;
HanLP.Config.HMMSegmentModelPath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.HMMSegmentModelPath;
HanLP.Config.CRFCWSModelPath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.CRFCWSModelPath;
HanLP.Config.CRFPOSModelPath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.CRFPOSModelPath;
HanLP.Config.CRFNERModelPath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.CRFNERModelPath;
HanLP.Config.PerceptronCWSModelPath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.PerceptronCWSModelPath;
HanLP.Config.PerceptronPOSModelPath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.PerceptronPOSModelPath;
HanLP.Config.PerceptronNERModelPath = hanlpPropertiesPath + FILE_SPILT + HanLP.Config.PerceptronNERModelPath;
}
public static String getHanlpPropertiesPath() throws FileNotFoundException {
return ResourceUtils.getFile("classpath:hanlp.properties").getParent();
}
public static boolean addToCustomDictionary(DictWord dictWord) {
log.info("dictWord:{}", dictWord);
return getDynamicCustomDictionary().insert(dictWord.getWord(), dictWord.getNatureWithFrequency());
}
public static void removeFromCustomDictionary(DictWord dictWord) {
log.info("dictWord:{}", dictWord);
CoreDictionary.Attribute attribute = getDynamicCustomDictionary().get(dictWord.getWord());
if (attribute == null) {
return;
}
log.info("get attribute:{}", attribute);
getDynamicCustomDictionary().remove(dictWord.getWord());
StringBuilder sb = new StringBuilder();
List<Nature> natureList = new ArrayList<>();
for (int i = 0; i < attribute.nature.length; i++) {
if (!attribute.nature[i].toString().equals(dictWord.getNature())) {
sb.append(attribute.nature[i]).append(" ");
sb.append(attribute.frequency[i]).append(" ");
natureList.add(attribute.nature[i]);
}
}
String natureWithFrequency = sb.toString();
int len = natureWithFrequency.length();
log.info("filtered natureWithFrequency:{}", natureWithFrequency);
if (StringUtils.isNotBlank(natureWithFrequency)) {
getDynamicCustomDictionary().add(dictWord.getWord(), natureWithFrequency.substring(0, len - 1));
}
SearchService.remove(dictWord, natureList.toArray(new Nature[0]));
}
public static <T extends MapResult> void transLetterOriginal(List<T> mapResults) {
if (CollectionUtils.isEmpty(mapResults)) {
return;
}
// go through the accessor so the dictionary is lazily initialized if needed
DynamicCustomDictionary customDictionary = getDynamicCustomDictionary();
for (T mapResult : mapResults) {
if (MultiCustomDictionary.isLowerLetter(mapResult.getName())
&& customDictionary.contains(mapResult.getName())) {
CoreDictionary.Attribute attribute = customDictionary.get(mapResult.getName());
if (attribute != null && attribute.original != null) {
mapResult.setName(attribute.original);
}
}
}
}
public static List<Term> getTerms(String text) {
return getSegment().seg(text.toLowerCase()).stream()
.filter(term -> term.getNature().startsWith(DictWordType.NATURE_SPILT))
.collect(Collectors.toList());
}
}
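A hypothetical usage sketch, not from the diff: registering a word under a view-scoped nature and segmenting with it. The literal "_1_3 1" nature-with-frequency string is an assumption inferred from NatureHelper's parsing convention further down, not an API guarantee:

import com.hankcs.hanlp.seg.common.Term;

import java.util.List;

public class HanlpHelperUsageSketch {
    public static void main(String[] args) {
        // "_<viewId>_<elementId>" nature plus a frequency, per the dictionary line format
        HanlpHelper.getDynamicCustomDictionary().insert("gross merchandise volume", "_1_3 1");
        List<Term> terms = HanlpHelper.getTerms("show gross merchandise volume by city");
        // getTerms keeps only terms whose nature starts with the "_" dictionary prefix
        terms.forEach(term -> System.out.println(term.word + " -> " + term.nature));
    }
}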

View File

@@ -1,84 +0,0 @@
package com.tencent.supersonic.chat.core.utils;
import static com.hankcs.hanlp.HanLP.Config.CustomDictionaryPath;
import com.hankcs.hanlp.dictionary.DynamicCustomDictionary;
import com.hankcs.hanlp.utility.Predefine;
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.List;
import lombok.extern.slf4j.Slf4j;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
/**
* Hdfs File Helper
*/
@Slf4j
public class HdfsFileHelper {
/**
* delete the compiled .bin dictionary caches on HDFS
*
* @param path custom dictionary paths; path[0] locates the dictionary folder
* @throws IOException
*/
public static void deleteCacheFile(String[] path) throws IOException {
FileSystem fs = FileSystem.get(URI.create(path[0]), new Configuration());
String cacheFilePath = path[0] + Predefine.BIN_EXT;
log.info("delete cache file:{}", cacheFilePath);
try {
fs.delete(new Path(cacheFilePath), false);
} catch (Exception e) {
log.error("delete:" + cacheFilePath, e);
}
int customBase = cacheFilePath.lastIndexOf(FileHelper.FILE_SPILT);
String customPath = cacheFilePath.substring(0, customBase) + FileHelper.FILE_SPILT + "*.bin";
List<String> fileList = getFileList(fs, new Path(customPath));
for (String file : fileList) {
try {
fs.delete(new Path(file), false);
log.info("delete cache file:{}", file);
} catch (Exception e) {
log.error("delete " + file, e);
}
}
log.info("fileList:{}", fileList);
}
/**
* re-point CustomDictionaryPath at the .txt dictionary files currently on HDFS
*
* @param customDictionary dictionary whose path should be refreshed
* @throws IOException
*/
public static void resetCustomPath(DynamicCustomDictionary customDictionary) throws IOException {
String[] path = CustomDictionaryPath;
FileSystem fs = FileSystem.get(URI.create(path[0]), new Configuration());
String cacheFilePath = path[0] + Predefine.BIN_EXT;
int customBase = cacheFilePath.lastIndexOf(FileHelper.FILE_SPILT);
String customPath = cacheFilePath.substring(0, customBase) + FileHelper.FILE_SPILT + "*.txt";
log.info("customPath:{}", customPath);
List<String> fileList = getFileList(fs, new Path(customPath));
log.info("CustomDictionaryPath:{}", fileList);
CustomDictionaryPath = fileList.toArray(new String[0]);
customDictionary.path = (CustomDictionaryPath == null || CustomDictionaryPath.length == 0) ? path
: CustomDictionaryPath;
if (CustomDictionaryPath == null || CustomDictionaryPath.length == 0) {
CustomDictionaryPath = path;
}
}
public static List<String> getFileList(FileSystem fs, Path folderPath) throws IOException {
List<String> paths = new ArrayList<>();
for (FileStatus fileStatus : fs.globStatus(folderPath)) {
paths.add(fileStatus.getPath().toString());
}
return paths;
}
}

View File

@@ -1,180 +0,0 @@
package com.tencent.supersonic.chat.core.utils;
import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.seg.common.Term;
import com.tencent.supersonic.chat.api.pojo.SchemaElementType;
import com.tencent.supersonic.chat.core.knowledge.ViewInfoStat;
import com.tencent.supersonic.common.pojo.enums.DictWordType;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.stream.Collectors;
/**
* nature parse helper
*/
@Slf4j
public class NatureHelper {
public static SchemaElementType convertToElementType(String nature) {
DictWordType dictWordType = DictWordType.getNatureType(nature);
if (Objects.isNull(dictWordType)) {
return null;
}
SchemaElementType result = null;
switch (dictWordType) {
case METRIC:
result = SchemaElementType.METRIC;
break;
case DIMENSION:
result = SchemaElementType.DIMENSION;
break;
case ENTITY:
result = SchemaElementType.ENTITY;
break;
case VIEW:
result = SchemaElementType.VIEW;
break;
case VALUE:
result = SchemaElementType.VALUE;
break;
case TAG:
result = SchemaElementType.TAG;
break;
default:
break;
}
return result;
}
private static boolean isViewOrEntity(Term term, Integer model) {
return (DictWordType.NATURE_SPILT + model).equals(term.nature.toString()) || term.nature.toString()
.endsWith(DictWordType.ENTITY.getType());
}
public static Integer getViewByNature(Nature nature) {
if (nature.startsWith(DictWordType.NATURE_SPILT)) {
String[] dimensionValues = nature.toString().split(DictWordType.NATURE_SPILT);
if (StringUtils.isNumeric(dimensionValues[1])) {
return Integer.valueOf(dimensionValues[1]);
}
}
return 0;
}
public static Long getViewId(String nature) {
try {
String[] split = nature.split(DictWordType.NATURE_SPILT);
if (split.length <= 1) {
return null;
}
return Long.valueOf(split[1]);
} catch (NumberFormatException e) {
log.error("", e);
}
return null;
}
public static boolean isDimensionValueViewId(String nature) {
if (StringUtils.isEmpty(nature)) {
return false;
}
if (!nature.startsWith(DictWordType.NATURE_SPILT)) {
return false;
}
String[] split = nature.split(DictWordType.NATURE_SPILT);
if (split.length <= 1) {
return false;
}
return !nature.endsWith(DictWordType.METRIC.getType()) && !nature.endsWith(DictWordType.DIMENSION.getType())
&& StringUtils.isNumeric(split[1]);
}
public static ViewInfoStat getViewStat(List<Term> terms) {
return ViewInfoStat.builder()
.viewCount(getViewCount(terms))
.dimensionViewCount(getDimensionCount(terms))
.metricViewCount(getMetricCount(terms))
.dimensionValueViewCount(getDimensionValueCount(terms))
.build();
}
private static long getViewCount(List<Term> terms) {
return terms.stream().filter(term -> isViewOrEntity(term, getViewByNature(term.nature))).count();
}
private static long getDimensionValueCount(List<Term> terms) {
return terms.stream().filter(term -> isDimensionValueViewId(term.nature.toString())).count();
}
private static long getDimensionCount(List<Term> terms) {
return terms.stream().filter(term -> term.nature.startsWith(DictWordType.NATURE_SPILT) && term.nature.toString()
.endsWith(DictWordType.DIMENSION.getType())).count();
}
private static long getMetricCount(List<Term> terms) {
return terms.stream().filter(term -> term.nature.startsWith(DictWordType.NATURE_SPILT) && term.nature.toString()
.endsWith(DictWordType.METRIC.getType())).count();
}
/**
* count how many words of each nature type were hit per view: viewId -> (natureType, count)
*
* @param terms segmented terms carrying dictionary natures
* @return per-view counts for each DictWordType
*/
public static Map<Long, Map<DictWordType, Integer>> getViewToNatureStat(List<Term> terms) {
Map<Long, Map<DictWordType, Integer>> modelToNature = new HashMap<>();
terms.stream().filter(
term -> term.nature.startsWith(DictWordType.NATURE_SPILT)
).forEach(term -> {
DictWordType dictWordType = DictWordType.getNatureType(String.valueOf(term.nature));
Long model = getViewId(String.valueOf(term.nature));
modelToNature.computeIfAbsent(model, k -> new HashMap<>()).merge(dictWordType, 1, Integer::sum);
});
return modelToNature;
}
public static List<Long> selectPossibleViews(List<Term> terms) {
Map<Long, Map<DictWordType, Integer>> modelToNatureStat = getViewToNatureStat(terms);
Integer maxViewTypeSize = modelToNatureStat.entrySet().stream()
.max(Comparator.comparingInt(o -> o.getValue().size())).map(entry -> entry.getValue().size())
.orElse(null);
if (Objects.isNull(maxViewTypeSize) || maxViewTypeSize == 0) {
return new ArrayList<>();
}
return modelToNatureStat.entrySet().stream().filter(entry -> entry.getValue().size() == maxViewTypeSize)
.map(entry -> entry.getKey()).collect(Collectors.toList());
}
public static Long getElementID(String nature) {
String[] split = nature.split(DictWordType.NATURE_SPILT);
if (split.length >= 3) {
return Long.valueOf(split[2]);
}
return 0L;
}
}
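To make the parsing above concrete, a small sketch of the nature-string convention NatureHelper decodes; the example literals are assumptions inferred from the split logic, not values taken from the diff:

public class NatureFormatSketch {
    public static void main(String[] args) {
        // "_<viewId>_<elementId>[_<suffix>]", e.g. a dimension-value word of view 1, element 3
        String nature = "_1_3";
        String[] parts = nature.split("_");                                // -> ["", "1", "3"]
        Long viewId = Long.valueOf(parts[1]);                              // as in NatureHelper.getViewId
        Long elementId = parts.length >= 3 ? Long.valueOf(parts[2]) : 0L;  // as in getElementID
        System.out.println("view=" + viewId + ", element=" + elementId);   // view=1, element=3
    }
}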

View File

@@ -1,7 +1,7 @@
package com.tencent.supersonic.chat.core.utils;
import com.google.common.collect.Lists;
import com.tencent.supersonic.chat.api.pojo.SchemaElement;
import com.tencent.supersonic.headless.api.pojo.SchemaElement;
import com.tencent.supersonic.chat.api.pojo.SemanticParseInfo;
import com.tencent.supersonic.chat.core.query.QueryManager;
import com.tencent.supersonic.common.pojo.Aggregator;

View File

@@ -1,7 +1,7 @@
package com.tencent.supersonic.chat.core.utils;
import com.tencent.supersonic.chat.api.pojo.SchemaElementMatch;
import com.tencent.supersonic.chat.api.pojo.SchemaElementType;
import com.tencent.supersonic.headless.api.pojo.SchemaElementType;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

View File

@@ -1,2 +0,0 @@
com.tencent.supersonic.chat.core.knowledge.FileHandler=\
com.tencent.supersonic.chat.core.knowledge.LocalFileHandler
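The deleted resource above maps the FileHandler interface to its LocalFileHandler implementation in a properties-style factories file (the trailing backslash is a standard properties line continuation). A minimal sketch of resolving such a mapping reflectively, assuming a plain java.util.Properties format; the loader the project actually uses is not shown in this diff:

import java.io.InputStream;
import java.util.Properties;

public class FactoryFileSketch {
    @SuppressWarnings("unchecked")
    static <T> T load(Class<T> iface, String resource) throws Exception {
        Properties props = new Properties();
        try (InputStream in = FactoryFileSketch.class.getClassLoader().getResourceAsStream(resource)) {
            props.load(in);
        }
        // e.g. key "com...FileHandler" -> value "com...LocalFileHandler"
        String implName = props.getProperty(iface.getName()).trim();
        return (T) Class.forName(implName).getDeclaredConstructor().newInstance();
    }
}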

Some files were not shown because too many files have changed in this diff.