mirror of https://github.com/tencentmusic/supersonic.git
synced 2025-12-12 20:51:48 +00:00
(improvement)(Chat) Move chat-core to headless (#805)
Co-authored-by: jolunoluo
@@ -29,7 +29,28 @@
            <artifactId>ST4</artifactId>
            <version>${st.version}</version>
        </dependency>

        <dependency>
            <groupId>org.testng</groupId>
            <artifactId>testng</artifactId>
            <version>${org.testng.version}</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.junit.jupiter</groupId>
            <artifactId>junit-jupiter</artifactId>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.mockito</groupId>
            <artifactId>mockito-inline</artifactId>
            <version>${mockito-inline.version}</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-context</artifactId>
@@ -140,6 +161,12 @@
            <version>${commons.compress.version}</version>
            <scope>compile</scope>
        </dependency>

        <dependency>
            <groupId>com.github.xkzhangsan</groupId>
            <artifactId>xk-time</artifactId>
            <version>${xk.time.version}</version>
        </dependency>
    </dependencies>
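The new test-scoped dependencies (TestNG, JUnit, JUnit Jupiter, mockito-inline) give the module a working unit-test toolchain. A minimal sanity sketch of what they enable (the test class and its interface are illustrative, not part of this commit; mockito-inline additionally allows mocking final classes):

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;

import org.junit.jupiter.api.Test;

class DependencySmokeTest {

    // hypothetical collaborator, only to exercise the mocking API
    interface SqlHolder {
        String correctS2SQL();
    }

    @Test
    void mockingWorksOnTestClasspath() {
        SqlHolder holder = mock(SqlHolder.class);
        when(holder.correctS2SQL()).thenReturn("SELECT 1");
        assertEquals("SELECT 1", holder.correctS2SQL());
    }
}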
@@ -0,0 +1,158 @@
package com.tencent.supersonic.headless.core.chat.corrector;

import com.tencent.supersonic.common.pojo.enums.AggregateTypeEnum;
import com.tencent.supersonic.common.pojo.enums.TimeDimensionEnum;
import com.tencent.supersonic.common.util.ContextUtils;
import com.tencent.supersonic.common.util.jsqlparser.SqlAddHelper;
import com.tencent.supersonic.common.util.jsqlparser.SqlSelectFunctionHelper;
import com.tencent.supersonic.common.util.jsqlparser.SqlSelectHelper;
import com.tencent.supersonic.headless.api.pojo.SchemaElement;
import com.tencent.supersonic.headless.api.pojo.SemanticParseInfo;
import com.tencent.supersonic.headless.api.pojo.SemanticSchema;
import com.tencent.supersonic.headless.core.pojo.QueryContext;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.springframework.core.env.Environment;
import org.springframework.util.CollectionUtils;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;

/**
 * Basic semantic correction functionality, offering common methods and an
 * abstract method called doCorrect.
 */
@Slf4j
public abstract class BaseSemanticCorrector implements SemanticCorrector {

    public void correct(QueryContext queryContext, SemanticParseInfo semanticParseInfo) {
        try {
            if (StringUtils.isBlank(semanticParseInfo.getSqlInfo().getCorrectS2SQL())) {
                return;
            }
            doCorrect(queryContext, semanticParseInfo);
            log.info("sqlCorrection:{} sql:{}", this.getClass().getSimpleName(), semanticParseInfo.getSqlInfo());
        } catch (Exception e) {
            log.error(String.format("correct error, sqlInfo:%s", semanticParseInfo.getSqlInfo()), e);
        }
    }

    public abstract void doCorrect(QueryContext queryContext, SemanticParseInfo semanticParseInfo);

    protected Map<String, String> getFieldNameMap(QueryContext queryContext, Long dataSetId) {

        SemanticSchema semanticSchema = queryContext.getSemanticSchema();

        List<SchemaElement> dbAllFields = new ArrayList<>();
        dbAllFields.addAll(semanticSchema.getMetrics());
        dbAllFields.addAll(semanticSchema.getDimensions());

        // support fieldName and field alias
        Map<String, String> result = dbAllFields.stream()
                .filter(entry -> dataSetId.equals(entry.getDataSet()))
                .flatMap(schemaElement -> {
                    Set<String> elements = new HashSet<>();
                    elements.add(schemaElement.getName());
                    if (!CollectionUtils.isEmpty(schemaElement.getAlias())) {
                        elements.addAll(schemaElement.getAlias());
                    }
                    return elements.stream();
                })
                .collect(Collectors.toMap(a -> a, a -> a, (k1, k2) -> k1));
        result.put(TimeDimensionEnum.DAY.getChName(), TimeDimensionEnum.DAY.getChName());
        result.put(TimeDimensionEnum.MONTH.getChName(), TimeDimensionEnum.MONTH.getChName());
        result.put(TimeDimensionEnum.WEEK.getChName(), TimeDimensionEnum.WEEK.getChName());

        result.put(TimeDimensionEnum.DAY.getName(), TimeDimensionEnum.DAY.getChName());
        result.put(TimeDimensionEnum.MONTH.getName(), TimeDimensionEnum.MONTH.getChName());
        result.put(TimeDimensionEnum.WEEK.getName(), TimeDimensionEnum.WEEK.getChName());

        return result;
    }

    protected void addFieldsToSelect(SemanticParseInfo semanticParseInfo, String correctS2SQL) {
        Set<String> selectFields = new HashSet<>(SqlSelectHelper.getSelectFields(correctS2SQL));
        Set<String> needAddFields = new HashSet<>(SqlSelectHelper.getGroupByFields(correctS2SQL));

        // decide whether to add order-by expression fields to select
        Environment environment = ContextUtils.getBean(Environment.class);
        String correctorAdditionalInfo = environment.getProperty("corrector.additional.information");
        if (StringUtils.isNotBlank(correctorAdditionalInfo) && Boolean.parseBoolean(correctorAdditionalInfo)) {
            needAddFields.addAll(SqlSelectHelper.getOrderByFields(correctS2SQL));
        }

        // If there is no aggregate function in the S2SQL statement and
        // there is a date field in the 'WHERE' clause, add the field to the 'SELECT' clause.
        if (!SqlSelectFunctionHelper.hasAggregateFunction(correctS2SQL)) {
            List<String> whereFields = SqlSelectHelper.getWhereFields(correctS2SQL);
            List<String> timeChNameList = TimeDimensionEnum.getChNameList();
            Set<String> timeFields = whereFields.stream().filter(field -> timeChNameList.contains(field))
                    .collect(Collectors.toSet());
            needAddFields.addAll(timeFields);
        }

        if (CollectionUtils.isEmpty(selectFields) || CollectionUtils.isEmpty(needAddFields)) {
            return;
        }

        needAddFields.removeAll(selectFields);
        String replaceFields = SqlAddHelper.addFieldsToSelect(correctS2SQL, new ArrayList<>(needAddFields));
        semanticParseInfo.getSqlInfo().setCorrectS2SQL(replaceFields);
    }

    protected void addAggregateToMetric(QueryContext queryContext, SemanticParseInfo semanticParseInfo) {
        // add aggregate to all metrics
        String correctS2SQL = semanticParseInfo.getSqlInfo().getCorrectS2SQL();
        Long dataSetId = semanticParseInfo.getDataSet().getDataSet();
        List<SchemaElement> metrics = getMetricElements(queryContext, dataSetId);

        Map<String, String> metricToAggregate = metrics.stream()
                .map(schemaElement -> {
                    if (Objects.isNull(schemaElement.getDefaultAgg())) {
                        schemaElement.setDefaultAgg(AggregateTypeEnum.SUM.name());
                    }
                    return schemaElement;
                }).flatMap(schemaElement -> {
                    Set<String> elements = new HashSet<>();
                    elements.add(schemaElement.getName());
                    if (!CollectionUtils.isEmpty(schemaElement.getAlias())) {
                        elements.addAll(schemaElement.getAlias());
                    }
                    return elements.stream().map(element -> Pair.of(element, schemaElement.getDefaultAgg()));
                }).collect(Collectors.toMap(Pair::getLeft, Pair::getRight, (k1, k2) -> k1));

        if (CollectionUtils.isEmpty(metricToAggregate)) {
            return;
        }
        String aggregateSql = SqlAddHelper.addAggregateToField(correctS2SQL, metricToAggregate);
        semanticParseInfo.getSqlInfo().setCorrectS2SQL(aggregateSql);
    }

    protected List<SchemaElement> getMetricElements(QueryContext queryContext, Long dataSetId) {
        SemanticSchema semanticSchema = queryContext.getSemanticSchema();
        return semanticSchema.getMetrics(dataSetId);
    }

    protected Set<String> getDimensions(Long dataSetId, SemanticSchema semanticSchema) {
        Set<String> dimensions = semanticSchema.getDimensions(dataSetId).stream()
                .flatMap(schemaElement -> {
                    Set<String> elements = new HashSet<>();
                    elements.add(schemaElement.getName());
                    if (!CollectionUtils.isEmpty(schemaElement.getAlias())) {
                        elements.addAll(schemaElement.getAlias());
                    }
                    return elements.stream();
                }).collect(Collectors.toSet());
        dimensions.add(TimeDimensionEnum.DAY.getChName());
        return dimensions;
    }
}
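Since correct(...) already guards against blank S2SQL and catches exceptions, a concrete corrector only has to implement doCorrect. A minimal hypothetical subclass as a sketch (LimitCorrector is not part of this commit; it assumes only the SqlInfo accessors used above, and the string check is deliberately naive):

public class LimitCorrector extends BaseSemanticCorrector {

    @Override
    public void doCorrect(QueryContext queryContext, SemanticParseInfo semanticParseInfo) {
        // illustrative: cap result size if the corrected S2SQL has no LIMIT yet
        String sql = semanticParseInfo.getSqlInfo().getCorrectS2SQL();
        if (!sql.toUpperCase().contains("LIMIT")) {
            semanticParseInfo.getSqlInfo().setCorrectS2SQL(sql + " LIMIT 1000");
        }
    }
}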
@@ -0,0 +1,32 @@
package com.tencent.supersonic.headless.core.chat.corrector;

import com.tencent.supersonic.headless.api.pojo.SemanticParseInfo;
import com.tencent.supersonic.headless.core.pojo.QueryContext;
import lombok.extern.slf4j.Slf4j;

import java.util.ArrayList;
import java.util.List;

/**
 * Corrects SQL syntax, primarily covering the SELECT, WHERE, GROUP BY, and HAVING clauses.
 */
@Slf4j
public class GrammarCorrector extends BaseSemanticCorrector {

    private List<BaseSemanticCorrector> correctors;

    public GrammarCorrector() {
        correctors = new ArrayList<>();
        correctors.add(new SelectCorrector());
        correctors.add(new WhereCorrector());
        correctors.add(new GroupByCorrector());
        correctors.add(new HavingCorrector());
    }

    @Override
    public void doCorrect(QueryContext queryContext, SemanticParseInfo semanticParseInfo) {
        for (BaseSemanticCorrector corrector : correctors) {
            corrector.correct(queryContext, semanticParseInfo);
        }
    }
}
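GrammarCorrector is itself a SemanticCorrector, so callers go through the same correct(...) entry point and the four clause-level correctors run in the order registered above: SELECT, WHERE, GROUP BY, then HAVING. A usage sketch (queryContext and semanticParseInfo are assumed to be populated by the parsing stage):

SemanticCorrector grammarCorrector = new GrammarCorrector();
grammarCorrector.correct(queryContext, semanticParseInfo);
// each sub-corrector rewrites semanticParseInfo.getSqlInfo().getCorrectS2SQL() in turn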
@@ -0,0 +1,90 @@
package com.tencent.supersonic.headless.core.chat.corrector;

import com.tencent.supersonic.common.pojo.enums.TimeDimensionEnum;
import com.tencent.supersonic.common.util.jsqlparser.SqlAddHelper;
import com.tencent.supersonic.common.util.jsqlparser.SqlSelectHelper;
import com.tencent.supersonic.headless.api.pojo.SemanticParseInfo;
import com.tencent.supersonic.headless.api.pojo.SemanticSchema;
import com.tencent.supersonic.headless.api.pojo.SqlInfo;
import com.tencent.supersonic.headless.core.pojo.QueryContext;
import lombok.extern.slf4j.Slf4j;
import org.springframework.util.CollectionUtils;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

/**
 * Perform SQL corrections on the "Group by" section in S2SQL.
 */
@Slf4j
public class GroupByCorrector extends BaseSemanticCorrector {

    @Override
    public void doCorrect(QueryContext queryContext, SemanticParseInfo semanticParseInfo) {
        Boolean needAddGroupBy = needAddGroupBy(queryContext, semanticParseInfo);
        if (!needAddGroupBy) {
            return;
        }
        addGroupByFields(queryContext, semanticParseInfo);
    }

    private Boolean needAddGroupBy(QueryContext queryContext, SemanticParseInfo semanticParseInfo) {
        Long dataSetId = semanticParseInfo.getDataSetId();
        // add dimension group by
        SqlInfo sqlInfo = semanticParseInfo.getSqlInfo();
        String correctS2SQL = sqlInfo.getCorrectS2SQL();
        SemanticSchema semanticSchema = queryContext.getSemanticSchema();
        // check for distinct
        if (SqlSelectHelper.hasDistinct(correctS2SQL)) {
            log.info("not adding group by, distinct exists in correctS2SQL:{}", correctS2SQL);
            return false;
        }
        // add alias field name
        Set<String> dimensions = getDimensions(dataSetId, semanticSchema);
        List<String> selectFields = SqlSelectHelper.getSelectFields(correctS2SQL);
        if (CollectionUtils.isEmpty(selectFields) || CollectionUtils.isEmpty(dimensions)) {
            return false;
        }
        // if only the date is selected, do not add group by
        if (selectFields.size() == 1 && selectFields.contains(TimeDimensionEnum.DAY.getChName())) {
            return false;
        }
        if (SqlSelectHelper.hasGroupBy(correctS2SQL)) {
            log.info("not adding group by, group by exists in correctS2SQL:{}", correctS2SQL);
            return false;
        }
        return true;
    }

    private void addGroupByFields(QueryContext queryContext, SemanticParseInfo semanticParseInfo) {
        Long dataSetId = semanticParseInfo.getDataSetId();
        // add dimension group by
        SqlInfo sqlInfo = semanticParseInfo.getSqlInfo();
        String correctS2SQL = sqlInfo.getCorrectS2SQL();
        SemanticSchema semanticSchema = queryContext.getSemanticSchema();
        // add alias field name
        Set<String> dimensions = getDimensions(dataSetId, semanticSchema);
        List<String> selectFields = SqlSelectHelper.getSelectFields(correctS2SQL);
        List<String> aggregateFields = SqlSelectHelper.getAggregateFields(correctS2SQL);
        Set<String> groupByFields = selectFields.stream()
                .filter(field -> dimensions.contains(field))
                .filter(field -> {
                    if (!CollectionUtils.isEmpty(aggregateFields) && aggregateFields.contains(field)) {
                        return false;
                    }
                    return true;
                })
                .collect(Collectors.toSet());
        semanticParseInfo.getSqlInfo().setCorrectS2SQL(SqlAddHelper.addGroupBy(correctS2SQL, groupByFields));
        addAggregate(queryContext, semanticParseInfo);
    }

    private void addAggregate(QueryContext queryContext, SemanticParseInfo semanticParseInfo) {
        List<String> sqlGroupByFields = SqlSelectHelper.getGroupByFields(
                semanticParseInfo.getSqlInfo().getCorrectS2SQL());
        if (CollectionUtils.isEmpty(sqlGroupByFields)) {
            return;
        }
        addAggregateToMetric(queryContext, semanticParseInfo);
    }
}
@@ -0,0 +1,69 @@
package com.tencent.supersonic.headless.core.chat.corrector;

import com.tencent.supersonic.common.util.ContextUtils;
import com.tencent.supersonic.common.util.jsqlparser.SqlAddHelper;
import com.tencent.supersonic.common.util.jsqlparser.SqlSelectFunctionHelper;
import com.tencent.supersonic.common.util.jsqlparser.SqlSelectHelper;
import com.tencent.supersonic.headless.api.pojo.SemanticParseInfo;
import com.tencent.supersonic.headless.api.pojo.SemanticSchema;
import com.tencent.supersonic.headless.core.pojo.QueryContext;
import lombok.extern.slf4j.Slf4j;
import net.sf.jsqlparser.expression.Expression;
import org.apache.commons.lang3.StringUtils;
import org.springframework.core.env.Environment;
import org.springframework.util.CollectionUtils;

import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

/**
 * Perform SQL corrections on the "Having" section in S2SQL.
 */
@Slf4j
public class HavingCorrector extends BaseSemanticCorrector {

    @Override
    public void doCorrect(QueryContext queryContext, SemanticParseInfo semanticParseInfo) {

        // add aggregate to all metrics
        addHaving(queryContext, semanticParseInfo);

        // decide whether to add the having expression field to select
        Environment environment = ContextUtils.getBean(Environment.class);
        String correctorAdditionalInfo = environment.getProperty("corrector.additional.information");
        if (StringUtils.isNotBlank(correctorAdditionalInfo) && Boolean.parseBoolean(correctorAdditionalInfo)) {
            addHavingToSelect(semanticParseInfo);
        }
    }

    private void addHaving(QueryContext queryContext, SemanticParseInfo semanticParseInfo) {
        Long dataSet = semanticParseInfo.getDataSet().getDataSet();

        SemanticSchema semanticSchema = queryContext.getSemanticSchema();

        Set<String> metrics = semanticSchema.getMetrics(dataSet).stream()
                .map(schemaElement -> schemaElement.getName()).collect(Collectors.toSet());

        if (CollectionUtils.isEmpty(metrics)) {
            return;
        }
        String havingSql = SqlAddHelper.addHaving(semanticParseInfo.getSqlInfo().getCorrectS2SQL(), metrics);
        semanticParseInfo.getSqlInfo().setCorrectS2SQL(havingSql);
    }

    private void addHavingToSelect(SemanticParseInfo semanticParseInfo) {
        String correctS2SQL = semanticParseInfo.getSqlInfo().getCorrectS2SQL();
        if (!SqlSelectFunctionHelper.hasAggregateFunction(correctS2SQL)) {
            return;
        }
        List<Expression> havingExpressionList = SqlSelectHelper.getHavingExpression(correctS2SQL);
        if (!CollectionUtils.isEmpty(havingExpressionList)) {
            String replaceSql = SqlAddHelper.addFunctionToSelect(correctS2SQL, havingExpressionList);
            semanticParseInfo.getSqlInfo().setCorrectS2SQL(replaceSql);
        }
    }

}
@@ -0,0 +1,150 @@
package com.tencent.supersonic.headless.core.chat.corrector;

import com.tencent.supersonic.common.pojo.Constants;
import com.tencent.supersonic.common.pojo.enums.FilterOperatorEnum;
import com.tencent.supersonic.common.pojo.enums.TimeDimensionEnum;
import com.tencent.supersonic.common.util.DateUtils;
import com.tencent.supersonic.common.util.JsonUtil;
import com.tencent.supersonic.common.util.jsqlparser.AggregateEnum;
import com.tencent.supersonic.common.util.jsqlparser.FieldExpression;
import com.tencent.supersonic.common.util.jsqlparser.SqlRemoveHelper;
import com.tencent.supersonic.common.util.jsqlparser.SqlReplaceHelper;
import com.tencent.supersonic.common.util.jsqlparser.SqlSelectHelper;
import com.tencent.supersonic.headless.api.pojo.SemanticParseInfo;
import com.tencent.supersonic.headless.api.pojo.SemanticSchema;
import com.tencent.supersonic.headless.api.pojo.SqlInfo;
import com.tencent.supersonic.headless.core.chat.parser.llm.ParseResult;
import com.tencent.supersonic.headless.core.pojo.QueryContext;
import com.tencent.supersonic.headless.core.chat.query.llm.s2sql.LLMReq.ElementValue;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.springframework.util.CollectionUtils;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;

/**
 * Perform schema corrections on the Schema information in S2SQL.
 */
@Slf4j
public class SchemaCorrector extends BaseSemanticCorrector {

    @Override
    public void doCorrect(QueryContext queryContext, SemanticParseInfo semanticParseInfo) {

        removeFilterIfNotInLinkingValue(queryContext, semanticParseInfo);

        correctAggFunction(semanticParseInfo);

        replaceAlias(semanticParseInfo);

        updateFieldNameByLinkingValue(semanticParseInfo);

        updateFieldValueByLinkingValue(semanticParseInfo);

        correctFieldName(queryContext, semanticParseInfo);
    }

    private void correctAggFunction(SemanticParseInfo semanticParseInfo) {
        Map<String, String> aggregateEnum = AggregateEnum.getAggregateEnum();
        SqlInfo sqlInfo = semanticParseInfo.getSqlInfo();
        String sql = SqlReplaceHelper.replaceFunction(sqlInfo.getCorrectS2SQL(), aggregateEnum);
        sqlInfo.setCorrectS2SQL(sql);
    }

    private void replaceAlias(SemanticParseInfo semanticParseInfo) {
        SqlInfo sqlInfo = semanticParseInfo.getSqlInfo();
        String replaceAlias = SqlReplaceHelper.replaceAlias(sqlInfo.getCorrectS2SQL());
        sqlInfo.setCorrectS2SQL(replaceAlias);
    }

    private void correctFieldName(QueryContext queryContext, SemanticParseInfo semanticParseInfo) {
        Map<String, String> fieldNameMap = getFieldNameMap(queryContext, semanticParseInfo.getDataSetId());
        SqlInfo sqlInfo = semanticParseInfo.getSqlInfo();
        String sql = SqlReplaceHelper.replaceFields(sqlInfo.getCorrectS2SQL(), fieldNameMap);
        sqlInfo.setCorrectS2SQL(sql);
    }

    private void updateFieldNameByLinkingValue(SemanticParseInfo semanticParseInfo) {
        List<ElementValue> linking = getLinkingValues(semanticParseInfo);
        if (CollectionUtils.isEmpty(linking)) {
            return;
        }

        Map<String, Set<String>> fieldValueToFieldNames = linking.stream().collect(
                Collectors.groupingBy(ElementValue::getFieldValue,
                        Collectors.mapping(ElementValue::getFieldName, Collectors.toSet())));

        SqlInfo sqlInfo = semanticParseInfo.getSqlInfo();

        String sql = SqlReplaceHelper.replaceFieldNameByValue(sqlInfo.getCorrectS2SQL(), fieldValueToFieldNames);
        sqlInfo.setCorrectS2SQL(sql);
    }

    private List<ElementValue> getLinkingValues(SemanticParseInfo semanticParseInfo) {
        Object context = semanticParseInfo.getProperties().get(Constants.CONTEXT);
        if (Objects.isNull(context)) {
            return null;
        }

        ParseResult parseResult = JsonUtil.toObject(JsonUtil.toString(context), ParseResult.class);
        if (Objects.isNull(parseResult) || Objects.isNull(parseResult.getLlmReq())) {
            return null;
        }
        return parseResult.getLinkingValues();
    }

    private void updateFieldValueByLinkingValue(SemanticParseInfo semanticParseInfo) {
        List<ElementValue> linking = getLinkingValues(semanticParseInfo);
        if (CollectionUtils.isEmpty(linking)) {
            return;
        }

        Map<String, Map<String, String>> fieldNameToValueMap = linking.stream().collect(
                Collectors.groupingBy(ElementValue::getFieldName,
                        Collectors.mapping(ElementValue::getFieldValue, Collectors.toMap(
                                oldValue -> oldValue,
                                newValue -> newValue,
                                (existingValue, newValue) -> newValue))));

        SqlInfo sqlInfo = semanticParseInfo.getSqlInfo();
        String sql = SqlReplaceHelper.replaceValue(sqlInfo.getCorrectS2SQL(), fieldNameToValueMap, false);
        sqlInfo.setCorrectS2SQL(sql);
    }

    public void removeFilterIfNotInLinkingValue(QueryContext queryContext, SemanticParseInfo semanticParseInfo) {
        SqlInfo sqlInfo = semanticParseInfo.getSqlInfo();
        String correctS2SQL = sqlInfo.getCorrectS2SQL();
        List<FieldExpression> whereExpressionList = SqlSelectHelper.getWhereExpressions(correctS2SQL);
        if (CollectionUtils.isEmpty(whereExpressionList)) {
            return;
        }
        List<ElementValue> linkingValues = getLinkingValues(semanticParseInfo);
        SemanticSchema semanticSchema = queryContext.getSemanticSchema();
        Set<String> dimensions = getDimensions(semanticParseInfo.getDataSetId(), semanticSchema);

        if (CollectionUtils.isEmpty(linkingValues)) {
            linkingValues = new ArrayList<>();
        }
        Set<String> linkingFieldNames = linkingValues.stream().map(linking -> linking.getFieldName())
                .collect(Collectors.toSet());

        Set<String> removeFieldNames = whereExpressionList.stream()
                .filter(fieldExpression -> StringUtils.isBlank(fieldExpression.getFunction()))
                .filter(fieldExpression -> !TimeDimensionEnum.containsTimeDimension(fieldExpression.getFieldName()))
                .filter(fieldExpression -> FilterOperatorEnum.EQUALS.getValue().equals(fieldExpression.getOperator()))
                .filter(fieldExpression -> dimensions.contains(fieldExpression.getFieldName()))
                .filter(fieldExpression -> !DateUtils.isAnyDateString(fieldExpression.getFieldValue().toString()))
                .filter(fieldExpression -> !linkingFieldNames.contains(fieldExpression.getFieldName()))
                .map(fieldExpression -> fieldExpression.getFieldName()).collect(Collectors.toSet());

        String sql = SqlRemoveHelper.removeWhereCondition(correctS2SQL, removeFieldNames);
        sqlInfo.setCorrectS2SQL(sql);
    }

}
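The two linking-value updates above pivot the same list in opposite directions: updateFieldNameByLinkingValue groups candidate field names by field value, while updateFieldValueByLinkingValue groups values by field name. A standalone sketch of the first grouping, using a local stand-in for ElementValue (hypothetical; a Java 16+ record is used for brevity):

import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;

class LinkingValueGroupingDemo {

    // local stand-in for LLMReq.ElementValue
    record ElementValue(String fieldName, String fieldValue) { }

    public static void main(String[] args) {
        List<ElementValue> linking = List.of(
                new ElementValue("singer", "Jay"),
                new ElementValue("composer", "Jay"));

        // same shape as updateFieldNameByLinkingValue: value -> candidate field names
        Map<String, Set<String>> fieldValueToFieldNames = linking.stream().collect(
                Collectors.groupingBy(ElementValue::fieldValue,
                        Collectors.mapping(ElementValue::fieldName, Collectors.toSet())));

        System.out.println(fieldValueToFieldNames); // {Jay=[singer, composer]} (set order may vary)
    }
}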
@@ -0,0 +1,31 @@
package com.tencent.supersonic.headless.core.chat.corrector;

import com.tencent.supersonic.common.util.jsqlparser.SqlSelectHelper;
import com.tencent.supersonic.headless.api.pojo.SemanticParseInfo;
import com.tencent.supersonic.headless.core.pojo.QueryContext;
import lombok.extern.slf4j.Slf4j;
import org.springframework.util.CollectionUtils;

import java.util.List;

/**
 * Perform SQL corrections on the "Select" section in S2SQL.
 */
@Slf4j
public class SelectCorrector extends BaseSemanticCorrector {

    @Override
    public void doCorrect(QueryContext queryContext, SemanticParseInfo semanticParseInfo) {
        String correctS2SQL = semanticParseInfo.getSqlInfo().getCorrectS2SQL();
        List<String> aggregateFields = SqlSelectHelper.getAggregateFields(correctS2SQL);
        List<String> selectFields = SqlSelectHelper.getSelectFields(correctS2SQL);
        // If the number of aggregated fields is equal to the number of queried fields, do not add fields to select.
        if (!CollectionUtils.isEmpty(aggregateFields)
                && !CollectionUtils.isEmpty(selectFields)
                && aggregateFields.size() == selectFields.size()) {
            return;
        }
        addFieldsToSelect(semanticParseInfo, correctS2SQL);
    }
}
@@ -0,0 +1,14 @@
package com.tencent.supersonic.headless.core.chat.corrector;

import com.tencent.supersonic.headless.api.pojo.SemanticParseInfo;
import com.tencent.supersonic.headless.core.pojo.QueryContext;

/**
 * A semantic corrector checks validity of extracted semantic information and
 * performs correction and optimization if needed.
 */
public interface SemanticCorrector {

    void correct(QueryContext queryContext, SemanticParseInfo semanticParseInfo);
}
@@ -0,0 +1,59 @@
package com.tencent.supersonic.headless.core.chat.corrector;

import com.tencent.supersonic.common.util.jsqlparser.DateVisitor.DateBoundInfo;
import com.tencent.supersonic.common.util.jsqlparser.SqlAddHelper;
import com.tencent.supersonic.common.util.jsqlparser.SqlDateSelectHelper;
import com.tencent.supersonic.common.util.jsqlparser.SqlReplaceHelper;
import com.tencent.supersonic.headless.api.pojo.SemanticParseInfo;
import com.tencent.supersonic.headless.core.pojo.QueryContext;
import lombok.extern.slf4j.Slf4j;
import net.sf.jsqlparser.JSQLParserException;
import net.sf.jsqlparser.parser.CCJSqlParserUtil;
import org.apache.commons.lang3.StringUtils;

import java.util.Objects;

/**
 * Perform SQL corrections on the time in S2SQL.
 */
@Slf4j
public class TimeCorrector extends BaseSemanticCorrector {

    @Override
    public void doCorrect(QueryContext queryContext, SemanticParseInfo semanticParseInfo) {

        parserDateDiffFunction(semanticParseInfo);

        addLowerBoundDate(semanticParseInfo);
    }

    private void addLowerBoundDate(SemanticParseInfo semanticParseInfo) {
        String correctS2SQL = semanticParseInfo.getSqlInfo().getCorrectS2SQL();
        DateBoundInfo dateBoundInfo = SqlDateSelectHelper.getDateBoundInfo(correctS2SQL);
        if (Objects.isNull(dateBoundInfo)) {
            return;
        }
        if (StringUtils.isBlank(dateBoundInfo.getLowerBound())
                && StringUtils.isNotBlank(dateBoundInfo.getUpperBound())
                && StringUtils.isNotBlank(dateBoundInfo.getUpperDate())) {
            String upperDate = dateBoundInfo.getUpperDate();
            try {
                correctS2SQL = SqlAddHelper.addParenthesisToWhere(correctS2SQL);
                String condExpr = dateBoundInfo.getColumName() + " >= '" + upperDate + "'";
                correctS2SQL = SqlAddHelper.addWhere(correctS2SQL, CCJSqlParserUtil.parseCondExpression(condExpr));
            } catch (JSQLParserException e) {
                log.error("parseCondExpression", e);
            }
            semanticParseInfo.getSqlInfo().setCorrectS2SQL(correctS2SQL);
        }
    }

    private void parserDateDiffFunction(SemanticParseInfo semanticParseInfo) {
        String correctS2SQL = semanticParseInfo.getSqlInfo().getCorrectS2SQL();
        correctS2SQL = SqlReplaceHelper.replaceFunction(correctS2SQL);
        semanticParseInfo.getSqlInfo().setCorrectS2SQL(correctS2SQL);
    }

}
@@ -0,0 +1,158 @@
package com.tencent.supersonic.headless.core.chat.corrector;

import com.tencent.supersonic.common.pojo.Constants;
import com.tencent.supersonic.common.pojo.enums.TimeDimensionEnum;
import com.tencent.supersonic.common.util.StringUtil;
import com.tencent.supersonic.common.util.jsqlparser.SqlAddHelper;
import com.tencent.supersonic.common.util.jsqlparser.SqlReplaceHelper;
import com.tencent.supersonic.common.util.jsqlparser.SqlSelectHelper;
import com.tencent.supersonic.headless.api.pojo.SchemaElement;
import com.tencent.supersonic.headless.api.pojo.SchemaValueMap;
import com.tencent.supersonic.headless.api.pojo.SemanticParseInfo;
import com.tencent.supersonic.headless.api.pojo.SemanticSchema;
import com.tencent.supersonic.headless.api.pojo.request.QueryFilters;
import com.tencent.supersonic.headless.core.pojo.QueryContext;
import com.tencent.supersonic.headless.core.utils.S2SqlDateHelper;
import lombok.extern.slf4j.Slf4j;
import net.sf.jsqlparser.JSQLParserException;
import net.sf.jsqlparser.expression.Expression;
import net.sf.jsqlparser.parser.CCJSqlParserUtil;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.logging.log4j.util.Strings;
import org.springframework.util.CollectionUtils;

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.stream.Collectors;

/**
 * Perform SQL corrections on the "Where" section in S2SQL.
 */
@Slf4j
public class WhereCorrector extends BaseSemanticCorrector {

    @Override
    public void doCorrect(QueryContext queryContext, SemanticParseInfo semanticParseInfo) {

        addDateIfNotExist(queryContext, semanticParseInfo);

        addQueryFilter(queryContext, semanticParseInfo);

        updateFieldValueByTechName(queryContext, semanticParseInfo);
    }

    private void addQueryFilter(QueryContext queryContext, SemanticParseInfo semanticParseInfo) {
        String queryFilter = getQueryFilter(queryContext.getQueryFilters());

        String correctS2SQL = semanticParseInfo.getSqlInfo().getCorrectS2SQL();

        if (StringUtils.isNotEmpty(queryFilter)) {
            log.info("add queryFilter to correctS2SQL:{}", queryFilter);
            Expression expression = null;
            try {
                expression = CCJSqlParserUtil.parseCondExpression(queryFilter);
            } catch (JSQLParserException e) {
                log.error("parseCondExpression", e);
            }
            correctS2SQL = SqlAddHelper.addWhere(correctS2SQL, expression);
            semanticParseInfo.getSqlInfo().setCorrectS2SQL(correctS2SQL);
        }
    }

    private void addDateIfNotExist(QueryContext queryContext, SemanticParseInfo semanticParseInfo) {
        String correctS2SQL = semanticParseInfo.getSqlInfo().getCorrectS2SQL();
        List<String> whereFields = SqlSelectHelper.getWhereFields(correctS2SQL);
        if (CollectionUtils.isEmpty(whereFields) || !TimeDimensionEnum.containsZhTimeDimension(whereFields)) {

            Pair<String, String> startEndDate = S2SqlDateHelper.getStartEndDate(queryContext,
                    semanticParseInfo.getDataSetId(), semanticParseInfo.getQueryType());

            if (StringUtils.isNotBlank(startEndDate.getLeft())
                    && StringUtils.isNotBlank(startEndDate.getRight())) {
                correctS2SQL = SqlAddHelper.addParenthesisToWhere(correctS2SQL);
                String dateChName = TimeDimensionEnum.DAY.getChName();
                String condExpr = String.format(" ( %s >= '%s' and %s <= '%s' )", dateChName,
                        startEndDate.getLeft(), dateChName, startEndDate.getRight());
                try {
                    Expression expression = CCJSqlParserUtil.parseCondExpression(condExpr);
                    correctS2SQL = SqlAddHelper.addWhere(correctS2SQL, expression);
                } catch (JSQLParserException e) {
                    log.error("parseCondExpression", e);
                }
            }
        }
        semanticParseInfo.getSqlInfo().setCorrectS2SQL(correctS2SQL);
    }

    private String getQueryFilter(QueryFilters queryFilters) {
        if (Objects.isNull(queryFilters) || CollectionUtils.isEmpty(queryFilters.getFilters())) {
            return null;
        }
        return queryFilters.getFilters().stream()
                .map(filter -> {
                    String bizNameWrap = StringUtil.getSpaceWrap(filter.getName());
                    String operatorWrap = StringUtil.getSpaceWrap(filter.getOperator().getValue());
                    String valueWrap = StringUtil.getCommaWrap(filter.getValue().toString());
                    return bizNameWrap + operatorWrap + valueWrap;
                })
                .collect(Collectors.joining(Constants.AND_UPPER));
    }

    private void updateFieldValueByTechName(QueryContext queryContext, SemanticParseInfo semanticParseInfo) {
        SemanticSchema semanticSchema = queryContext.getSemanticSchema();
        Long dataSetId = semanticParseInfo.getDataSetId();
        List<SchemaElement> dimensions = semanticSchema.getDimensions(dataSetId);

        if (CollectionUtils.isEmpty(dimensions)) {
            return;
        }

        Map<String, Map<String, String>> aliasAndBizNameToTechName = getAliasAndBizNameToTechName(dimensions);
        String correctS2SQL = SqlReplaceHelper.replaceValue(semanticParseInfo.getSqlInfo().getCorrectS2SQL(),
                aliasAndBizNameToTechName);
        semanticParseInfo.getSqlInfo().setCorrectS2SQL(correctS2SQL);
    }

    private Map<String, Map<String, String>> getAliasAndBizNameToTechName(List<SchemaElement> dimensions) {
        if (CollectionUtils.isEmpty(dimensions)) {
            return new HashMap<>();
        }

        Map<String, Map<String, String>> result = new HashMap<>();

        for (SchemaElement dimension : dimensions) {
            if (Objects.isNull(dimension)
                    || Strings.isEmpty(dimension.getName())
                    || CollectionUtils.isEmpty(dimension.getSchemaValueMaps())) {
                continue;
            }
            String name = dimension.getName();

            Map<String, String> aliasAndBizNameToTechName = new HashMap<>();

            for (SchemaValueMap valueMap : dimension.getSchemaValueMaps()) {
                if (Objects.isNull(valueMap) || Strings.isEmpty(valueMap.getTechName())) {
                    continue;
                }
                if (Strings.isNotEmpty(valueMap.getBizName())) {
                    aliasAndBizNameToTechName.put(valueMap.getBizName(), valueMap.getTechName());
                }
                if (!CollectionUtils.isEmpty(valueMap.getAlias())) {
                    valueMap.getAlias().stream().forEach(alias -> {
                        if (Strings.isNotEmpty(alias)) {
                            aliasAndBizNameToTechName.put(alias, valueMap.getTechName());
                        }
                    });
                }
            }
            if (!CollectionUtils.isEmpty(aliasAndBizNameToTechName)) {
                result.put(name, aliasAndBizNameToTechName);
            }
        }
        return result;
    }
}
@@ -0,0 +1,100 @@
package com.tencent.supersonic.headless.core.chat.mapper;

import com.tencent.supersonic.headless.api.pojo.DataSetSchema;
import com.tencent.supersonic.headless.api.pojo.SchemaElement;
import com.tencent.supersonic.headless.api.pojo.SchemaElementMatch;
import com.tencent.supersonic.headless.api.pojo.SchemaElementType;
import com.tencent.supersonic.headless.api.pojo.SchemaMapInfo;
import com.tencent.supersonic.headless.api.pojo.SemanticSchema;
import com.tencent.supersonic.headless.core.pojo.QueryContext;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.BeanUtils;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.stream.Collectors;

@Slf4j
public abstract class BaseMapper implements SchemaMapper {

    @Override
    public void map(QueryContext queryContext) {

        String simpleName = this.getClass().getSimpleName();
        long startTime = System.currentTimeMillis();
        log.debug("before {},mapInfo:{}", simpleName, queryContext.getMapInfo().getDataSetElementMatches());

        try {
            doMap(queryContext);
        } catch (Exception e) {
            log.error("work error", e);
        }

        long cost = System.currentTimeMillis() - startTime;
        log.debug("after {},cost:{},mapInfo:{}", simpleName, cost,
                queryContext.getMapInfo().getDataSetElementMatches());
    }

    public abstract void doMap(QueryContext queryContext);

    public void addToSchemaMap(SchemaMapInfo schemaMap, Long modelId, SchemaElementMatch newElementMatch) {
        Map<Long, List<SchemaElementMatch>> modelElementMatches = schemaMap.getDataSetElementMatches();
        List<SchemaElementMatch> schemaElementMatches = modelElementMatches.putIfAbsent(modelId, new ArrayList<>());
        if (schemaElementMatches == null) {
            schemaElementMatches = modelElementMatches.get(modelId);
        }
        // remove duplication
        AtomicBoolean needAddNew = new AtomicBoolean(true);
        schemaElementMatches.removeIf(
                existElementMatch -> {
                    SchemaElement existElement = existElementMatch.getElement();
                    SchemaElement newElement = newElementMatch.getElement();
                    if (existElement.equals(newElement)) {
                        if (newElementMatch.getSimilarity() > existElementMatch.getSimilarity()) {
                            return true;
                        } else {
                            needAddNew.set(false);
                        }
                    }
                    return false;
                }
        );
        if (needAddNew.get()) {
            schemaElementMatches.add(newElementMatch);
        }
    }

    public SchemaElement getSchemaElement(Long dataSetId, SchemaElementType elementType, Long elementID,
            SemanticSchema semanticSchema) {
        SchemaElement element = new SchemaElement();
        DataSetSchema dataSetSchema = semanticSchema.getDataSetSchemaMap().get(dataSetId);
        if (Objects.isNull(dataSetSchema)) {
            return null;
        }
        SchemaElement elementDb = dataSetSchema.getElement(elementType, elementID);
        if (Objects.isNull(elementDb)) {
            log.info("element is null, elementType:{},elementID:{}", elementType, elementID);
            return null;
        }
        BeanUtils.copyProperties(elementDb, element);
        element.setAlias(getAlias(elementDb));
        return element;
    }

    public List<String> getAlias(SchemaElement element) {
        if (!SchemaElementType.VALUE.equals(element.getType())) {
            return element.getAlias();
        }
        if (org.apache.commons.collections.CollectionUtils.isNotEmpty(element.getAlias()) && StringUtils.isNotEmpty(
                element.getName())) {
            return element.getAlias().stream()
                    .filter(aliasItem -> aliasItem.contains(element.getName()))
                    .collect(Collectors.toList());
        }
        return element.getAlias();
    }
}
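Concrete mappers implement doMap and report hits through addToSchemaMap, which keeps at most one match per schema element, preferring the higher similarity. A hypothetical minimal mapper built only on the API visible in this diff (ExactNameMapper is not part of this commit):

public class ExactNameMapper extends BaseMapper {

    @Override
    public void doMap(QueryContext queryContext) {
        // illustrative: match dimensions whose name occurs verbatim in the query text
        for (SchemaElement element : queryContext.getSemanticSchema().getDimensions()) {
            if (queryContext.getQueryText().contains(element.getName())) {
                SchemaElementMatch match = SchemaElementMatch.builder()
                        .element(element)
                        .word(element.getName())
                        .detectWord(element.getName())
                        .similarity(1.0)
                        .build();
                addToSchemaMap(queryContext.getMapInfo(), element.getDataSet(), match);
            }
        }
    }
}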
@@ -0,0 +1,157 @@
package com.tencent.supersonic.headless.core.chat.mapper;

import com.tencent.supersonic.headless.api.pojo.response.S2Term;
import com.tencent.supersonic.headless.core.pojo.QueryContext;
import com.tencent.supersonic.headless.core.knowledge.helper.NatureHelper;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;

@Service
@Slf4j
public abstract class BaseMatchStrategy<T> implements MatchStrategy<T> {

    @Autowired
    private MapperHelper mapperHelper;

    @Override
    public Map<MatchText, List<T>> match(QueryContext queryContext, List<S2Term> terms,
            Set<Long> detectDataSetIds) {
        String text = queryContext.getQueryText();
        if (Objects.isNull(terms) || StringUtils.isEmpty(text)) {
            return null;
        }

        log.debug("terms:{}, detectDataSetIds:{}", terms, detectDataSetIds);

        List<T> detects = detect(queryContext, terms, detectDataSetIds);
        Map<MatchText, List<T>> result = new HashMap<>();

        result.put(MatchText.builder().regText(text).detectSegment(text).build(), detects);
        return result;
    }

    public List<T> detect(QueryContext queryContext, List<S2Term> terms, Set<Long> detectDataSetIds) {
        Map<Integer, Integer> regOffsetToLength = getRegOffsetToLength(terms);
        String text = queryContext.getQueryText();
        Set<T> results = new HashSet<>();

        Set<String> detectSegments = new HashSet<>();

        for (Integer startIndex = 0; startIndex <= text.length() - 1; ) {

            for (Integer index = startIndex; index <= text.length(); ) {
                int offset = mapperHelper.getStepOffset(terms, startIndex);
                index = mapperHelper.getStepIndex(regOffsetToLength, index);
                if (index <= text.length()) {
                    String detectSegment = text.substring(startIndex, index).trim();
                    detectSegments.add(detectSegment);
                    detectByStep(queryContext, results, detectDataSetIds, detectSegment, offset);
                }
            }
            startIndex = mapperHelper.getStepIndex(regOffsetToLength, startIndex);
        }
        detectByBatch(queryContext, results, detectDataSetIds, detectSegments);
        return new ArrayList<>(results);
    }

    protected void detectByBatch(QueryContext queryContext, Set<T> results, Set<Long> detectDataSetIds,
            Set<String> detectSegments) {
        return;
    }

    public Map<Integer, Integer> getRegOffsetToLength(List<S2Term> terms) {
        return terms.stream().sorted(Comparator.comparing(S2Term::length))
                .collect(Collectors.toMap(S2Term::getOffset, term -> term.word.length(),
                        (value1, value2) -> value2));
    }

    public void selectResultInOneRound(Set<T> existResults, List<T> oneRoundResults) {
        if (CollectionUtils.isEmpty(oneRoundResults)) {
            return;
        }
        for (T oneRoundResult : oneRoundResults) {
            if (existResults.contains(oneRoundResult)) {
                boolean isDeleted = existResults.removeIf(
                        existResult -> {
                            boolean delete = needDelete(oneRoundResult, existResult);
                            if (delete) {
                                log.info("deleted existResult:{}", existResult);
                            }
                            return delete;
                        }
                );
                if (isDeleted) {
                    log.info("deleted, add oneRoundResult:{}", oneRoundResult);
                    existResults.add(oneRoundResult);
                }
            } else {
                existResults.add(oneRoundResult);
            }
        }
    }

    public List<T> getMatches(QueryContext queryContext, List<S2Term> terms) {
        Set<Long> dataSetIds = queryContext.getDataSetIds();
        terms = filterByDataSetId(terms, dataSetIds);
        Map<MatchText, List<T>> matchResult = match(queryContext, terms, dataSetIds);
        List<T> matches = new ArrayList<>();
        if (Objects.isNull(matchResult)) {
            return matches;
        }
        Optional<List<T>> first = matchResult.entrySet().stream()
                .filter(entry -> CollectionUtils.isNotEmpty(entry.getValue()))
                .map(entry -> entry.getValue()).findFirst();

        if (first.isPresent()) {
            matches = first.get();
        }
        return matches;
    }

    public List<S2Term> filterByDataSetId(List<S2Term> terms, Set<Long> dataSetIds) {
        logTerms(terms);
        if (CollectionUtils.isNotEmpty(dataSetIds)) {
            terms = terms.stream().filter(term -> {
                Long dataSetId = NatureHelper.getDataSetId(term.getNature().toString());
                if (Objects.nonNull(dataSetId)) {
                    return dataSetIds.contains(dataSetId);
                }
                return false;
            }).collect(Collectors.toList());
            log.info("terms filter by dataSetId:{}", dataSetIds);
            logTerms(terms);
        }
        return terms;
    }

    public void logTerms(List<S2Term> terms) {
        if (CollectionUtils.isEmpty(terms)) {
            return;
        }
        for (S2Term term : terms) {
            log.debug("word:{},nature:{},frequency:{}", term.word, term.nature.toString(), term.getFrequency());
        }
    }

    public abstract boolean needDelete(T oneRoundResult, T existResult);

    public abstract String getMapKey(T a);

    public abstract void detectByStep(QueryContext queryContext, Set<T> existResults, Set<Long> detectDataSetIds,
            String detectSegment, int offset);

}
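The nested loops in detect(...) enumerate candidate substrings of the query text, advancing both indexes by term-derived step sizes rather than one character at a time. A standalone simplification with a fixed step of 1 shows the enumeration the loops perform:

import java.util.LinkedHashSet;
import java.util.Set;

class SegmentEnumerationDemo {
    public static void main(String[] args) {
        String text = "abcd";
        Set<String> detectSegments = new LinkedHashSet<>();
        // simplified detect(): fixed step of 1 instead of mapperHelper.getStepIndex(...)
        for (int startIndex = 0; startIndex <= text.length() - 1; startIndex++) {
            for (int index = startIndex + 1; index <= text.length(); index++) {
                detectSegments.add(text.substring(startIndex, index).trim());
            }
        }
        System.out.println(detectSegments); // [a, ab, abc, abcd, b, bc, bcd, c, cd, d]
    }
}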
@@ -0,0 +1,126 @@
package com.tencent.supersonic.headless.core.chat.mapper;

import com.tencent.supersonic.common.pojo.Constants;
import com.tencent.supersonic.headless.api.pojo.SchemaElement;
import com.tencent.supersonic.headless.api.pojo.SchemaElementMatch;
import com.tencent.supersonic.headless.api.pojo.response.S2Term;
import com.tencent.supersonic.headless.core.config.OptimizationConfig;
import com.tencent.supersonic.headless.core.pojo.QueryContext;
import com.tencent.supersonic.headless.core.knowledge.DatabaseMapResult;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import org.springframework.util.CollectionUtils;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.stream.Collectors;

/**
 * DatabaseMatchStrategy uses SQL LIKE operator to match schema elements.
 * It currently supports fuzzy matching against names and aliases.
 */
@Service
@Slf4j
public class DatabaseMatchStrategy extends BaseMatchStrategy<DatabaseMapResult> {

    @Autowired
    private OptimizationConfig optimizationConfig;
    @Autowired
    private MapperHelper mapperHelper;
    private List<SchemaElement> allElements;

    @Override
    public Map<MatchText, List<DatabaseMapResult>> match(QueryContext queryContext, List<S2Term> terms,
            Set<Long> detectDataSetIds) {
        this.allElements = getSchemaElements(queryContext);
        return super.match(queryContext, terms, detectDataSetIds);
    }

    @Override
    public boolean needDelete(DatabaseMapResult oneRoundResult, DatabaseMapResult existResult) {
        return getMapKey(oneRoundResult).equals(getMapKey(existResult))
                && existResult.getDetectWord().length() < oneRoundResult.getDetectWord().length();
    }

    @Override
    public String getMapKey(DatabaseMapResult a) {
        return a.getName() + Constants.UNDERLINE + a.getSchemaElement().getId()
                + Constants.UNDERLINE + a.getSchemaElement().getName();
    }

    @Override
    public void detectByStep(QueryContext queryContext, Set<DatabaseMapResult> existResults,
            Set<Long> detectDataSetIds, String detectSegment, int offset) {
        if (StringUtils.isBlank(detectSegment)) {
            return;
        }

        Double metricDimensionThresholdConfig = getThreshold(queryContext);
        Map<String, Set<SchemaElement>> nameToItems = getNameToItems(allElements);

        for (Entry<String, Set<SchemaElement>> entry : nameToItems.entrySet()) {
            String name = entry.getKey();
            if (!name.contains(detectSegment)
                    || mapperHelper.getSimilarity(detectSegment, name) < metricDimensionThresholdConfig) {
                continue;
            }
            Set<SchemaElement> schemaElements = entry.getValue();
            if (!CollectionUtils.isEmpty(detectDataSetIds)) {
                schemaElements = schemaElements.stream()
                        .filter(schemaElement -> detectDataSetIds.contains(schemaElement.getDataSet()))
                        .collect(Collectors.toSet());
            }
            for (SchemaElement schemaElement : schemaElements) {
                DatabaseMapResult databaseMapResult = new DatabaseMapResult();
                databaseMapResult.setDetectWord(detectSegment);
                databaseMapResult.setName(schemaElement.getName());
                databaseMapResult.setSchemaElement(schemaElement);
                existResults.add(databaseMapResult);
            }
        }
    }

    private List<SchemaElement> getSchemaElements(QueryContext queryContext) {
        List<SchemaElement> allElements = new ArrayList<>();
        allElements.addAll(queryContext.getSemanticSchema().getDimensions());
        allElements.addAll(queryContext.getSemanticSchema().getMetrics());
        return allElements;
    }

    private Double getThreshold(QueryContext queryContext) {
        Double metricDimensionThresholdConfig = optimizationConfig.getMetricDimensionThresholdConfig();
        Double metricDimensionMinThresholdConfig = optimizationConfig.getMetricDimensionMinThresholdConfig();

        Map<Long, List<SchemaElementMatch>> modelElementMatches = queryContext.getMapInfo().getDataSetElementMatches();

        boolean existElement = modelElementMatches.entrySet().stream().anyMatch(entry -> entry.getValue().size() >= 1);

        if (!existElement) {
            double halfThreshold = metricDimensionThresholdConfig / 2;

            metricDimensionThresholdConfig = halfThreshold >= metricDimensionMinThresholdConfig ? halfThreshold
                    : metricDimensionMinThresholdConfig;
            log.info("modelElementMatches:{}, no element matched, metricDimensionThresholdConfig reduced by half:{}",
                    modelElementMatches, metricDimensionThresholdConfig);
        }
        return metricDimensionThresholdConfig;
    }

    private Map<String, Set<SchemaElement>> getNameToItems(List<SchemaElement> models) {
        return models.stream().collect(
                Collectors.toMap(SchemaElement::getName, a -> {
                    Set<SchemaElement> result = new HashSet<>();
                    result.add(a);
                    return result;
                }, (k1, k2) -> {
                    k1.addAll(k2);
                    return k1;
                }));
    }
}
@@ -0,0 +1,60 @@
package com.tencent.supersonic.headless.core.chat.mapper;

import com.tencent.supersonic.common.util.ContextUtils;
import com.tencent.supersonic.common.util.embedding.Retrieval;
import com.tencent.supersonic.headless.api.pojo.SchemaElement;
import com.tencent.supersonic.headless.api.pojo.SchemaElementMatch;
import com.tencent.supersonic.headless.api.pojo.SchemaElementType;
import com.tencent.supersonic.headless.api.pojo.response.S2Term;
import com.tencent.supersonic.headless.core.pojo.QueryContext;
import com.tencent.supersonic.headless.core.knowledge.EmbeddingResult;
import com.tencent.supersonic.headless.core.knowledge.builder.BaseWordBuilder;
import com.tencent.supersonic.headless.core.knowledge.helper.HanlpHelper;
import lombok.extern.slf4j.Slf4j;

import java.util.List;
import java.util.Objects;

/**
 * A mapper that recognizes schema elements with vector embedding.
 */
@Slf4j
public class EmbeddingMapper extends BaseMapper {

    @Override
    public void doMap(QueryContext queryContext) {
        // 1. query from embedding by queryText
        String queryText = queryContext.getQueryText();
        List<S2Term> terms = HanlpHelper.getTerms(queryText, queryContext.getModelIdToDataSetIds());

        EmbeddingMatchStrategy matchStrategy = ContextUtils.getBean(EmbeddingMatchStrategy.class);
        List<EmbeddingResult> matchResults = matchStrategy.getMatches(queryContext, terms);

        HanlpHelper.transLetterOriginal(matchResults);

        // 2. build SchemaElementMatch by info
        for (EmbeddingResult matchResult : matchResults) {
            Long elementId = Retrieval.getLongId(matchResult.getId());
            Long dataSetId = Retrieval.getLongId(matchResult.getMetadata().get("dataSetId"));
            if (Objects.isNull(dataSetId)) {
                continue;
            }
            SchemaElementType elementType = SchemaElementType.valueOf(matchResult.getMetadata().get("type"));
            SchemaElement schemaElement = getSchemaElement(dataSetId, elementType, elementId,
                    queryContext.getSemanticSchema());
            if (schemaElement == null) {
                continue;
            }
            SchemaElementMatch schemaElementMatch = SchemaElementMatch.builder()
                    .element(schemaElement)
                    .frequency(BaseWordBuilder.DEFAULT_FREQUENCY)
                    .word(matchResult.getName())
                    .similarity(1 - matchResult.getDistance())
                    .detectWord(matchResult.getDetectWord())
                    .build();
            // 3. add to mapInfo
            addToSchemaMap(queryContext.getMapInfo(), dataSetId, schemaElementMatch);
        }
    }
}
@@ -0,0 +1,125 @@
|
||||
package com.tencent.supersonic.headless.core.chat.mapper;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import com.tencent.supersonic.common.pojo.Constants;
|
||||
import com.tencent.supersonic.common.util.embedding.Retrieval;
|
||||
import com.tencent.supersonic.common.util.embedding.RetrieveQuery;
|
||||
import com.tencent.supersonic.common.util.embedding.RetrieveQueryResult;
|
||||
import com.tencent.supersonic.headless.core.config.OptimizationConfig;
|
||||
import com.tencent.supersonic.headless.core.knowledge.EmbeddingResult;
|
||||
import com.tencent.supersonic.headless.core.knowledge.MetaEmbeddingService;
|
||||
import com.tencent.supersonic.headless.core.pojo.QueryContext;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.apache.commons.collections.CollectionUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.springframework.beans.BeanUtils;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/**
|
||||
* EmbeddingMatchStrategy uses vector database to perform
|
||||
* similarity search against the embeddings of schema elements.
|
||||
*/
|
||||
@Service
|
||||
@Slf4j
|
||||
public class EmbeddingMatchStrategy extends BaseMatchStrategy<EmbeddingResult> {
|
||||
|
||||
@Autowired
|
||||
private OptimizationConfig optimizationConfig;
|
||||
|
||||
@Autowired
|
||||
private MetaEmbeddingService metaEmbeddingService;
|
||||
|
||||
@Override
|
||||
public boolean needDelete(EmbeddingResult oneRoundResult, EmbeddingResult existResult) {
|
||||
return getMapKey(oneRoundResult).equals(getMapKey(existResult))
|
||||
&& existResult.getDistance() > oneRoundResult.getDistance();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getMapKey(EmbeddingResult a) {
|
||||
return a.getName() + Constants.UNDERLINE + a.getId();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void detectByStep(QueryContext queryContext, Set<EmbeddingResult> existResults, Set<Long> detectDataSetIds,
|
||||
String detectSegment, int offset) {
|
||||
|
||||
}
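
    // length-filter the detect segments, then partition them so that each request
    // to the vector store stays within the configured batch size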
    @Override
    protected void detectByBatch(QueryContext queryContext, Set<EmbeddingResult> results, Set<Long> detectDataSetIds,
            Set<String> detectSegments) {

        List<String> queryTextsList = detectSegments.stream()
                .map(detectSegment -> detectSegment.trim())
                .filter(detectSegment -> StringUtils.isNotBlank(detectSegment)
                        && detectSegment.length() >= optimizationConfig.getEmbeddingMapperWordMin()
                        && detectSegment.length() <= optimizationConfig.getEmbeddingMapperWordMax())
                .collect(Collectors.toList());

        List<List<String>> queryTextsSubList = Lists.partition(queryTextsList,
                optimizationConfig.getEmbeddingMapperBatch());

        for (List<String> queryTextsSub : queryTextsSubList) {
            detectByQueryTextsSub(results, detectDataSetIds, queryTextsSub, queryContext.getModelIdToDataSetIds());
        }
    }

    private void detectByQueryTextsSub(Set<EmbeddingResult> results, Set<Long> detectDataSetIds,
            List<String> queryTextsSub, Map<Long, List<Long>> modelIdToDataSetIds) {
        int embeddingNumber = optimizationConfig.getEmbeddingMapperNumber();
        Double distance = optimizationConfig.getEmbeddingMapperDistanceThreshold();
        // step1. build query params
        RetrieveQuery retrieveQuery = RetrieveQuery.builder().queryTextsList(queryTextsSub).build();
        // step2. retrieveQuery by detectSegment
        List<RetrieveQueryResult> retrieveQueryResults = metaEmbeddingService.retrieveQuery(
                retrieveQuery, embeddingNumber, modelIdToDataSetIds);

        if (CollectionUtils.isEmpty(retrieveQueryResults)) {
            return;
        }
        // step3. build EmbeddingResults
        List<EmbeddingResult> collect = retrieveQueryResults.stream()
                .map(retrieveQueryResult -> {
                    List<Retrieval> retrievals = retrieveQueryResult.getRetrieval();
                    if (CollectionUtils.isNotEmpty(retrievals)) {
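                        // drop retrievals above the distance threshold unless the retrieved
                        // text literally appears in the query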
                        retrievals.removeIf(retrieval -> {
                            if (!retrieveQueryResult.getQuery().contains(retrieval.getQuery())) {
                                return retrieval.getDistance() > distance.doubleValue();
                            }
                            return false;
                        });
                    }
                    return retrieveQueryResult;
                })
                .filter(retrieveQueryResult -> CollectionUtils.isNotEmpty(retrieveQueryResult.getRetrieval()))
                .flatMap(retrieveQueryResult -> retrieveQueryResult.getRetrieval().stream()
                        .map(retrieval -> {
                            EmbeddingResult embeddingResult = new EmbeddingResult();
                            BeanUtils.copyProperties(retrieval, embeddingResult);
                            embeddingResult.setDetectWord(retrieveQueryResult.getQuery());
                            embeddingResult.setName(retrieval.getQuery());
                            Map<String, String> convertedMap = retrieval.getMetadata().entrySet().stream()
                                    .collect(Collectors.toMap(Map.Entry::getKey, entry -> entry.getValue().toString()));
                            embeddingResult.setMetadata(convertedMap);
                            return embeddingResult;
                        }))
                .collect(Collectors.toList());

        // step4. select mapResult in one round
        int roundNumber = optimizationConfig.getEmbeddingMapperRoundNumber() * queryTextsSub.size();
        List<EmbeddingResult> oneRoundResults = collect.stream()
                .sorted(Comparator.comparingDouble(EmbeddingResult::getDistance))
                .limit(roundNumber)
                .collect(Collectors.toList());
        selectResultInOneRound(results, oneRoundResults);
    }

}
@@ -0,0 +1,75 @@
package com.tencent.supersonic.headless.core.chat.mapper;

import com.tencent.supersonic.headless.api.pojo.DataSetSchema;
import com.tencent.supersonic.headless.api.pojo.SchemaElement;
import com.tencent.supersonic.headless.api.pojo.SchemaElementMatch;
import com.tencent.supersonic.headless.api.pojo.SchemaElementType;
import com.tencent.supersonic.headless.api.pojo.SchemaMapInfo;
import com.tencent.supersonic.headless.api.pojo.SemanticSchema;
import com.tencent.supersonic.headless.core.pojo.QueryContext;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.BeanUtils;
import org.springframework.util.CollectionUtils;

import java.util.List;
import java.util.stream.Collectors;

/**
 * A mapper that converts matched VALUE elements of an entity dimension into ID types.
 */
@Slf4j
public class EntityMapper extends BaseMapper {

    @Override
    public void doMap(QueryContext queryContext) {
        SchemaMapInfo schemaMapInfo = queryContext.getMapInfo();
        for (Long dataSetId : schemaMapInfo.getMatchedDataSetInfos()) {
            List<SchemaElementMatch> schemaElementMatchList = schemaMapInfo.getMatchedElements(dataSetId);
            if (CollectionUtils.isEmpty(schemaElementMatchList)) {
                continue;
            }
            SchemaElement entity = getEntity(dataSetId, queryContext);
            if (entity == null || entity.getId() == null) {
                continue;
            }
            List<SchemaElementMatch> valueSchemaElements = schemaElementMatchList.stream()
                    .filter(schemaElementMatch ->
                            SchemaElementType.VALUE.equals(schemaElementMatch.getElement().getType()))
                    .collect(Collectors.toList());
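            // for VALUE matches that hit the entity's element, add a corresponding
            // ENTITY match (if absent) and retype the value match as ID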
            for (SchemaElementMatch schemaElementMatch : valueSchemaElements) {
                if (!entity.getId().equals(schemaElementMatch.getElement().getId())) {
                    continue;
                }
                if (!checkExistSameEntitySchemaElements(schemaElementMatch, schemaElementMatchList)) {
                    SchemaElementMatch entitySchemaElementMatch = new SchemaElementMatch();
                    BeanUtils.copyProperties(schemaElementMatch, entitySchemaElementMatch);
                    entitySchemaElementMatch.setElement(entity);
                    schemaElementMatchList.add(entitySchemaElementMatch);
                }
                schemaElementMatch.getElement().setType(SchemaElementType.ID);
            }
        }
    }

    private boolean checkExistSameEntitySchemaElements(SchemaElementMatch valueSchemaElementMatch,
            List<SchemaElementMatch> schemaElementMatchList) {
        List<SchemaElementMatch> entitySchemaElements = schemaElementMatchList.stream().filter(schemaElementMatch ->
                        SchemaElementType.ENTITY.equals(schemaElementMatch.getElement().getType()))
                .collect(Collectors.toList());
        for (SchemaElementMatch schemaElementMatch : entitySchemaElements) {
            if (schemaElementMatch.getElement().getId().equals(valueSchemaElementMatch.getElement().getId())) {
                return true;
            }
        }
        return false;
    }

    private SchemaElement getEntity(Long dataSetId, QueryContext queryContext) {
        SemanticSchema semanticSchema = queryContext.getSemanticSchema();
        DataSetSchema modelSchema = semanticSchema.getDataSetSchemaMap().get(dataSetId);
        if (modelSchema != null && modelSchema.getEntity() != null) {
            return modelSchema.getEntity();
        }
        return null;
    }
}
@@ -0,0 +1,121 @@
package com.tencent.supersonic.headless.core.chat.mapper;

import com.tencent.supersonic.common.pojo.Constants;
import com.tencent.supersonic.headless.api.pojo.response.S2Term;
import com.tencent.supersonic.headless.core.config.OptimizationConfig;
import com.tencent.supersonic.headless.core.pojo.QueryContext;
import com.tencent.supersonic.headless.core.knowledge.HanlpMapResult;
import com.tencent.supersonic.headless.core.knowledge.KnowledgeService;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;

/**
 * HanlpDictMatchStrategy uses <a href="https://www.hanlp.com/">HanLP</a> to
 * match schema elements. It currently supports prefix and suffix matching
 * against names, values and aliases.
 */
@Service
@Slf4j
public class HanlpDictMatchStrategy extends BaseMatchStrategy<HanlpMapResult> {

    @Autowired
    private MapperHelper mapperHelper;

    @Autowired
    private OptimizationConfig optimizationConfig;

    @Autowired
    private KnowledgeService knowledgeService;

    @Override
    public Map<MatchText, List<HanlpMapResult>> match(QueryContext queryContext, List<S2Term> terms,
            Set<Long> detectDataSetIds) {
        String text = queryContext.getQueryText();
        if (Objects.isNull(terms) || StringUtils.isEmpty(text)) {
            return null;
        }

        log.debug("terms:{}, detectDataSetIds:{}", terms, detectDataSetIds);

        List<HanlpMapResult> detects = detect(queryContext, terms, detectDataSetIds);
        Map<MatchText, List<HanlpMapResult>> result = new HashMap<>();

        result.put(MatchText.builder().regText(text).detectSegment(text).build(), detects);
        return result;
    }
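
    // when two results share the same map key, keep the one detected from the longer segment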
    @Override
    public boolean needDelete(HanlpMapResult oneRoundResult, HanlpMapResult existResult) {
        return getMapKey(oneRoundResult).equals(getMapKey(existResult))
                && existResult.getDetectWord().length() < oneRoundResult.getDetectWord().length();
    }

    public void detectByStep(QueryContext queryContext, Set<HanlpMapResult> existResults, Set<Long> detectDataSetIds,
            String detectSegment, int offset) {
        // step1. prefix search
        Integer oneDetectionMaxSize = optimizationConfig.getOneDetectionMaxSize();
        LinkedHashSet<HanlpMapResult> hanlpMapResults = knowledgeService.prefixSearch(detectSegment,
                        oneDetectionMaxSize, queryContext.getModelIdToDataSetIds())
                .stream().collect(Collectors.toCollection(LinkedHashSet::new));
        // step2. suffix search
        LinkedHashSet<HanlpMapResult> suffixHanlpMapResults = knowledgeService.suffixSearch(detectSegment,
                        oneDetectionMaxSize, queryContext.getModelIdToDataSetIds())
                .stream().collect(Collectors.toCollection(LinkedHashSet::new));

        hanlpMapResults.addAll(suffixHanlpMapResults);

        if (CollectionUtils.isEmpty(hanlpMapResults)) {
            return;
        }
        // step3. sort merged prefix/suffix results by name length
        hanlpMapResults = hanlpMapResults.stream().sorted((a, b) -> -(b.getName().length() - a.getName().length()))
                .collect(Collectors.toCollection(LinkedHashSet::new));

        // step4. filter by similarity
        hanlpMapResults = hanlpMapResults.stream()
                .filter(term -> mapperHelper.getSimilarity(detectSegment, term.getName())
                        >= mapperHelper.getThresholdMatch(term.getNatures()))
                .filter(term -> CollectionUtils.isNotEmpty(term.getNatures()))
                .collect(Collectors.toCollection(LinkedHashSet::new));

        log.info("after isSimilarity parseResults:{}", hanlpMapResults);

        hanlpMapResults = hanlpMapResults.stream().map(parseResult -> {
            parseResult.setOffset(offset);
            parseResult.setSimilarity(mapperHelper.getSimilarity(detectSegment, parseResult.getName()));
            return parseResult;
        }).collect(Collectors.toCollection(LinkedHashSet::new));

        // step5. take only one dimension value, or at most oneDetectionSize metric/dimension results, per round
        List<HanlpMapResult> dimensionMetrics = hanlpMapResults.stream()
                .filter(entry -> mapperHelper.existDimensionValues(entry.getNatures()))
                .collect(Collectors.toList())
                .stream()
                .limit(1)
                .collect(Collectors.toList());

        Integer oneDetectionSize = optimizationConfig.getOneDetectionSize();
        List<HanlpMapResult> oneRoundResults = hanlpMapResults.stream().limit(oneDetectionSize)
                .collect(Collectors.toList());

        if (CollectionUtils.isNotEmpty(dimensionMetrics)) {
            oneRoundResults = dimensionMetrics;
        }
        // step6. select mapResult in one round
        selectResultInOneRound(existResults, oneRoundResults);
    }

    public String getMapKey(HanlpMapResult a) {
        return a.getName() + Constants.UNDERLINE + String.join(Constants.UNDERLINE, a.getNatures());
    }
}
@@ -0,0 +1,124 @@
package com.tencent.supersonic.headless.core.chat.mapper;

import com.tencent.supersonic.common.util.ContextUtils;
import com.tencent.supersonic.headless.api.pojo.SchemaElement;
import com.tencent.supersonic.headless.api.pojo.SchemaElementMatch;
import com.tencent.supersonic.headless.api.pojo.SchemaElementType;
import com.tencent.supersonic.headless.api.pojo.SchemaMapInfo;
import com.tencent.supersonic.headless.api.pojo.response.S2Term;
import com.tencent.supersonic.headless.core.pojo.QueryContext;
import com.tencent.supersonic.headless.core.knowledge.DatabaseMapResult;
import com.tencent.supersonic.headless.core.knowledge.HanlpMapResult;
import com.tencent.supersonic.headless.core.knowledge.helper.HanlpHelper;
import com.tencent.supersonic.headless.core.knowledge.helper.NatureHelper;
import lombok.extern.slf4j.Slf4j;
import org.springframework.util.CollectionUtils;

import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;

/**
 * A mapper that recognizes schema elements via keyword matching.
 * It leverages two matching strategies: HanlpDictMatchStrategy and DatabaseMatchStrategy.
 */
@Slf4j
public class KeywordMapper extends BaseMapper {

    @Override
    public void doMap(QueryContext queryContext) {
        String queryText = queryContext.getQueryText();
        //1. HanLP dict match
        List<S2Term> terms = HanlpHelper.getTerms(queryText, queryContext.getModelIdToDataSetIds());
        HanlpDictMatchStrategy hanlpMatchStrategy = ContextUtils.getBean(HanlpDictMatchStrategy.class);

        List<HanlpMapResult> hanlpMapResults = hanlpMatchStrategy.getMatches(queryContext, terms);
        convertHanlpMapResultToMapInfo(hanlpMapResults, queryContext, terms);

        //2. database match
        DatabaseMatchStrategy databaseMatchStrategy = ContextUtils.getBean(DatabaseMatchStrategy.class);

        List<DatabaseMapResult> databaseResults = databaseMatchStrategy.getMatches(queryContext, terms);
        convertDatabaseMapResultToMapInfo(queryContext, databaseResults);
    }

    private void convertHanlpMapResultToMapInfo(List<HanlpMapResult> mapResults, QueryContext queryContext,
            List<S2Term> terms) {
        if (CollectionUtils.isEmpty(mapResults)) {
            return;
        }
        HanlpHelper.transLetterOriginal(mapResults);
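        // index each term's frequency by word + nature for the lookups below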
        Map<String, Long> wordNatureToFrequency = terms.stream().collect(
                Collectors.toMap(entry -> entry.getWord() + entry.getNature(),
                        term -> Long.valueOf(term.getFrequency()), (value1, value2) -> value2));

        for (HanlpMapResult hanlpMapResult : mapResults) {
            for (String nature : hanlpMapResult.getNatures()) {
                Long dataSetId = NatureHelper.getDataSetId(nature);
                if (Objects.isNull(dataSetId)) {
                    continue;
                }
                SchemaElementType elementType = NatureHelper.convertToElementType(nature);
                if (Objects.isNull(elementType)) {
                    continue;
                }
                Long elementID = NatureHelper.getElementID(nature);
                SchemaElement element = getSchemaElement(dataSetId, elementType,
                        elementID, queryContext.getSemanticSchema());
                if (element == null) {
                    continue;
                }
                if (element.getType().equals(SchemaElementType.VALUE) || element.getType()
                        .equals(SchemaElementType.TAG_VALUE)) {
                    element.setName(hanlpMapResult.getName());
                }
                Long frequency = wordNatureToFrequency.get(hanlpMapResult.getName() + nature);
                SchemaElementMatch schemaElementMatch = SchemaElementMatch.builder()
                        .element(element)
                        .frequency(frequency)
                        .word(hanlpMapResult.getName())
                        .similarity(hanlpMapResult.getSimilarity())
                        .detectWord(hanlpMapResult.getDetectWord())
                        .build();

                addToSchemaMap(queryContext.getMapInfo(), dataSetId, schemaElementMatch);
            }
        }
    }

    private void convertDatabaseMapResultToMapInfo(QueryContext queryContext, List<DatabaseMapResult> mapResults) {
        MapperHelper mapperHelper = ContextUtils.getBean(MapperHelper.class);
        for (DatabaseMapResult match : mapResults) {
            SchemaElement schemaElement = match.getSchemaElement();
            Set<Long> regElementSet = getRegElementSet(queryContext.getMapInfo(), schemaElement);
            if (regElementSet.contains(schemaElement.getId())) {
                continue;
            }
            SchemaElementMatch schemaElementMatch = SchemaElementMatch.builder()
                    .element(schemaElement)
                    .word(schemaElement.getName())
                    .detectWord(match.getDetectWord())
                    .frequency(10000L)
                    .similarity(mapperHelper.getSimilarity(match.getDetectWord(), schemaElement.getName()))
                    .build();
            log.info("add to schema, elementMatch {}", schemaElementMatch);
            addToSchemaMap(queryContext.getMapInfo(), schemaElement.getDataSet(), schemaElementMatch);
        }
    }

    private Set<Long> getRegElementSet(SchemaMapInfo schemaMap, SchemaElement schemaElement) {
        List<SchemaElementMatch> elements = schemaMap.getMatchedElements(schemaElement.getDataSet());
        if (CollectionUtils.isEmpty(elements)) {
            return new HashSet<>();
        }
        return elements.stream()
                .filter(elementMatch ->
                        SchemaElementType.METRIC.equals(elementMatch.getElement().getType())
                                || SchemaElementType.DIMENSION.equals(elementMatch.getElement().getType()))
                .map(elementMatch -> elementMatch.getElement().getId())
                .collect(Collectors.toSet());
    }
}
@@ -0,0 +1,81 @@
package com.tencent.supersonic.headless.core.chat.mapper;

import com.hankcs.hanlp.algorithm.EditDistance;
import com.tencent.supersonic.headless.api.pojo.response.S2Term;
import com.tencent.supersonic.headless.core.config.OptimizationConfig;
import com.tencent.supersonic.headless.core.knowledge.helper.NatureHelper;
import lombok.Data;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.stream.Collectors;

@Data
@Service
@Slf4j
public class MapperHelper {

    @Autowired
    private OptimizationConfig optimizationConfig;

    public Integer getStepIndex(Map<Integer, Integer> regOffsetToLength, Integer index) {
        Integer subRegLength = regOffsetToLength.get(index);
        if (Objects.nonNull(subRegLength)) {
            index = index + subRegLength;
        } else {
            index++;
        }
        return index;
    }

    public Integer getStepOffset(List<S2Term> termList, Integer index) {
        List<Integer> offsetList = termList.stream().sorted(Comparator.comparing(S2Term::getOffset))
                .map(term -> term.getOffset()).collect(Collectors.toList());

        for (int j = 0; j < termList.size() - 1; j++) {
            if (offsetList.get(j) <= index && offsetList.get(j + 1) > index) {
                return offsetList.get(j);
            }
        }
        return index;
    }

    public double getThresholdMatch(List<String> natures) {
        if (existDimensionValues(natures)) {
            return optimizationConfig.getDimensionValueThresholdConfig();
        }
        return optimizationConfig.getMetricDimensionThresholdConfig();
    }

    /**
     * whether any nature corresponds to a dimension value
     * @param natures
     * @return
     */
    public boolean existDimensionValues(List<String> natures) {
        for (String nature : natures) {
            if (NatureHelper.isDimensionValueDataSetId(nature)) {
                return true;
            }
        }
        return false;
    }

    /**
     * compute the similarity between a detected segment and a candidate name
     * @param detectSegment
     * @param matchName
     * @return
     */
    public double getSimilarity(String detectSegment, String matchName) {
        String detectSegmentLower = detectSegment == null ? null : detectSegment.toLowerCase();
        String matchNameLower = matchName == null ? null : matchName.toLowerCase();
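        // normalized edit-distance similarity: 1 - editDistance / max(length)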
        return 1 - (double) EditDistance.compute(detectSegmentLower, matchNameLower) / Math.max(matchName.length(),
                detectSegment.length());
    }
}
@@ -0,0 +1,19 @@
package com.tencent.supersonic.headless.core.chat.mapper;

import com.tencent.supersonic.headless.api.pojo.response.S2Term;
import com.tencent.supersonic.headless.core.pojo.QueryContext;

import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * MatchStrategy encapsulates a concrete matching algorithm
 * executed during the query or search process.
 */
public interface MatchStrategy<T> {

    Map<MatchText, List<T>> match(QueryContext queryContext, List<S2Term> terms, Set<Long> detectDataSetIds);

}
@@ -0,0 +1,34 @@
package com.tencent.supersonic.headless.core.chat.mapper;

import lombok.Builder;
import lombok.Data;
import lombok.ToString;

import java.util.Objects;

@Data
@ToString
@Builder
public class MatchText {

    private String regText;

    private String detectSegment;

    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (o == null || getClass() != o.getClass()) {
            return false;
        }
        MatchText that = (MatchText) o;
        return Objects.equals(regText, that.regText) && Objects.equals(detectSegment, that.detectSegment);
    }

    @Override
    public int hashCode() {
        return Objects.hash(regText, detectSegment);
    }
}
@@ -0,0 +1,20 @@
package com.tencent.supersonic.headless.core.chat.mapper;

import com.tencent.supersonic.headless.api.pojo.SchemaElementType;
import lombok.Data;
import lombok.ToString;

import java.io.Serializable;

@Data
@ToString
public class ModelWithSemanticType implements Serializable {

    private Long model;
    private SchemaElementType schemaElementType;

    public ModelWithSemanticType(Long model, SchemaElementType schemaElementType) {
        this.model = model;
        this.schemaElementType = schemaElementType;
    }
}
@@ -0,0 +1,94 @@
package com.tencent.supersonic.headless.core.chat.mapper;

import com.google.common.collect.Lists;
import com.tencent.supersonic.common.pojo.Constants;
import com.tencent.supersonic.headless.api.pojo.SchemaElement;
import com.tencent.supersonic.headless.api.pojo.SchemaElementMatch;
import com.tencent.supersonic.headless.api.pojo.SchemaElementType;
import com.tencent.supersonic.headless.api.pojo.SchemaMapInfo;
import com.tencent.supersonic.headless.api.pojo.request.QueryFilter;
import com.tencent.supersonic.headless.api.pojo.request.QueryFilters;
import com.tencent.supersonic.headless.core.pojo.QueryContext;
import com.tencent.supersonic.headless.core.knowledge.builder.BaseWordBuilder;
import lombok.extern.slf4j.Slf4j;
import org.springframework.util.CollectionUtils;

import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;

@Slf4j
public class QueryFilterMapper implements SchemaMapper {

    private double similarity = 1.0;

    @Override
    public void map(QueryContext queryContext) {
        Set<Long> dataSetIds = queryContext.getDataSetIds();
        if (CollectionUtils.isEmpty(dataSetIds)) {
            return;
        }
        SchemaMapInfo schemaMapInfo = queryContext.getMapInfo();
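        // drop previously matched elements that belong to data sets outside the query scope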
        clearOtherSchemaElementMatch(dataSetIds, schemaMapInfo);
        for (Long dataSetId : dataSetIds) {
            List<SchemaElementMatch> schemaElementMatches = schemaMapInfo.getMatchedElements(dataSetId);
            if (schemaElementMatches == null) {
                schemaElementMatches = Lists.newArrayList();
                schemaMapInfo.setMatchedElements(dataSetId, schemaElementMatches);
            }
            addValueSchemaElementMatch(dataSetId, queryContext, schemaElementMatches);
        }
    }

    private void clearOtherSchemaElementMatch(Set<Long> viewIds, SchemaMapInfo schemaMapInfo) {
        for (Map.Entry<Long, List<SchemaElementMatch>> entry : schemaMapInfo.getDataSetElementMatches().entrySet()) {
            if (!viewIds.contains(entry.getKey())) {
                entry.getValue().clear();
            }
        }
    }

    private List<SchemaElementMatch> addValueSchemaElementMatch(Long viewId, QueryContext queryContext,
            List<SchemaElementMatch> candidateElementMatches) {
        QueryFilters queryFilters = queryContext.getQueryFilters();
        if (queryFilters == null || CollectionUtils.isEmpty(queryFilters.getFilters())) {
            return candidateElementMatches;
        }
        for (QueryFilter filter : queryFilters.getFilters()) {
            if (checkExistSameValueSchemaElementMatch(filter, candidateElementMatches)) {
                continue;
            }
            SchemaElement element = SchemaElement.builder()
                    .id(filter.getElementID())
                    .name(String.valueOf(filter.getValue()))
                    .type(SchemaElementType.VALUE)
                    .bizName(filter.getBizName())
                    .dataSet(viewId)
                    .build();
            SchemaElementMatch schemaElementMatch = SchemaElementMatch.builder()
                    .element(element)
                    .frequency(BaseWordBuilder.DEFAULT_FREQUENCY)
                    .word(String.valueOf(filter.getValue()))
                    .similarity(similarity)
                    .detectWord(Constants.EMPTY)
                    .build();
            candidateElementMatches.add(schemaElementMatch);
        }
        return candidateElementMatches;
    }

    private boolean checkExistSameValueSchemaElementMatch(QueryFilter queryFilter,
            List<SchemaElementMatch> schemaElementMatches) {
        List<SchemaElementMatch> valueSchemaElements = schemaElementMatches.stream().filter(schemaElementMatch ->
                        SchemaElementType.VALUE.equals(schemaElementMatch.getElement().getType()))
                .collect(Collectors.toList());
        for (SchemaElementMatch schemaElementMatch : valueSchemaElements) {
            if (schemaElementMatch.getElement().getId().equals(queryFilter.getElementID())
                    && schemaElementMatch.getWord().equals(String.valueOf(queryFilter.getValue()))) {
                return true;
            }
        }
        return false;
    }
}
@@ -0,0 +1,13 @@
package com.tencent.supersonic.headless.core.chat.mapper;

import com.tencent.supersonic.headless.core.pojo.QueryContext;

/**
 * A schema mapper identifies references to schema elements (metrics/dimensions/entities/values)
 * in user queries. It matches the query text against the knowledge base.
 */
public interface SchemaMapper {

    void map(QueryContext queryContext);
}
@@ -0,0 +1,102 @@
package com.tencent.supersonic.headless.core.chat.mapper;

import com.google.common.collect.Lists;
import com.tencent.supersonic.common.pojo.enums.DictWordType;
import com.tencent.supersonic.headless.api.pojo.response.S2Term;
import com.tencent.supersonic.headless.core.pojo.QueryContext;
import com.tencent.supersonic.headless.core.knowledge.HanlpMapResult;
import com.tencent.supersonic.headless.core.knowledge.KnowledgeService;
import com.tencent.supersonic.headless.core.knowledge.SearchService;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.stream.Collectors;

/**
 * SearchMatchStrategy encapsulates a concrete matching algorithm
 * executed during the search process.
 */
@Service
public class SearchMatchStrategy extends BaseMatchStrategy<HanlpMapResult> {

    private static final int SEARCH_SIZE = 3;

    @Autowired
    private KnowledgeService knowledgeService;

    @Override
    public Map<MatchText, List<HanlpMapResult>> match(QueryContext queryContext, List<S2Term> originals,
            Set<Long> detectDataSetIds) {
        String text = queryContext.getQueryText();
        Map<Integer, Integer> regOffsetToLength = getRegOffsetToLength(originals);

        List<Integer> detectIndexList = Lists.newArrayList();
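
        // collect candidate detection start indexes, jumping over spans already
        // recognized as terms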
        for (Integer index = 0; index < text.length(); ) {

            if (index < text.length()) {
                detectIndexList.add(index);
            }
            Integer regLength = regOffsetToLength.get(index);
            if (Objects.nonNull(regLength)) {
                index = index + regLength;
            } else {
                index++;
            }
        }
        Map<MatchText, List<HanlpMapResult>> regTextMap = new ConcurrentHashMap<>();
        detectIndexList.stream().parallel().forEach(detectIndex -> {
            String regText = text.substring(0, detectIndex);
            String detectSegment = text.substring(detectIndex);

            if (StringUtils.isNotEmpty(detectSegment)) {
                List<HanlpMapResult> hanlpMapResults = knowledgeService.prefixSearch(detectSegment,
                        SearchService.SEARCH_SIZE, queryContext.getModelIdToDataSetIds());
                List<HanlpMapResult> suffixHanlpMapResults = knowledgeService.suffixSearch(
                        detectSegment, SEARCH_SIZE, queryContext.getModelIdToDataSetIds());
                hanlpMapResults.addAll(suffixHanlpMapResults);
                // drop results whose natures are all entity names
                hanlpMapResults = hanlpMapResults.stream().filter(entry -> {
                    List<String> natures = entry.getNatures().stream()
                            .filter(nature -> !nature.endsWith(DictWordType.ENTITY.getTypeWithSpilt()))
                            .collect(Collectors.toList());
                    if (CollectionUtils.isEmpty(natures)) {
                        return false;
                    }
                    return true;
                }).collect(Collectors.toList());
                MatchText matchText = MatchText.builder()
                        .regText(regText)
                        .detectSegment(detectSegment)
                        .build();
                regTextMap.put(matchText, hanlpMapResults);
            }
        });
        return regTextMap;
    }

    @Override
    public boolean needDelete(HanlpMapResult oneRoundResult, HanlpMapResult existResult) {
        return false;
    }

    @Override
    public String getMapKey(HanlpMapResult a) {
        return null;
    }

    @Override
    public void detectByStep(QueryContext queryContext, Set<HanlpMapResult> existResults, Set<Long> detectDataSetIds,
            String detectSegment, int offset) {

    }

}
@@ -0,0 +1,49 @@
package com.tencent.supersonic.headless.core.chat.parser;

import com.tencent.supersonic.common.util.ContextUtils;
import com.tencent.supersonic.headless.core.chat.parser.llm.SqlGeneration;
import com.tencent.supersonic.headless.core.chat.parser.llm.SqlGenerationFactory;
import com.tencent.supersonic.headless.core.chat.query.llm.s2sql.LLMReq;
import com.tencent.supersonic.headless.core.chat.query.llm.s2sql.LLMReq.SqlGenerationMode;
import com.tencent.supersonic.headless.core.chat.query.llm.s2sql.LLMResp;
import com.tencent.supersonic.headless.core.pojo.QueryContext;
import dev.langchain4j.model.chat.ChatLanguageModel;
import lombok.extern.slf4j.Slf4j;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component;

import java.util.Objects;

/**
 * LLMProxy implementation based on langchain4j, the Java version of LangChain.
 */
@Slf4j
@Component
public class JavaLLMProxy implements LLMProxy {

    private static final Logger keyPipelineLog = LoggerFactory.getLogger("keyPipeline");

    @Override
    public boolean isSkip(QueryContext queryContext) {
        ChatLanguageModel chatLanguageModel = ContextUtils.getBean(ChatLanguageModel.class);
        if (Objects.isNull(chatLanguageModel)) {
            log.warn("chatLanguageModel is null, skip :{}", JavaLLMProxy.class.getName());
            return true;
        }
        return false;
    }

    public LLMResp query2sql(LLMReq llmReq, Long dataSetId) {
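
        // dispatch to the SqlGeneration implementation configured for this request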
        SqlGeneration sqlGeneration = SqlGenerationFactory.get(
                SqlGenerationMode.getMode(llmReq.getSqlGenerationMode()));
        String modelName = llmReq.getSchema().getDataSetName();
        LLMResp result = sqlGeneration.generation(llmReq, dataSetId);
        result.setQuery(llmReq.getQueryText());
        result.setModelName(modelName);
        return result;
    }

}
@@ -0,0 +1,19 @@
package com.tencent.supersonic.headless.core.chat.parser;

import com.tencent.supersonic.headless.core.pojo.QueryContext;
import com.tencent.supersonic.headless.core.chat.query.llm.s2sql.LLMReq;
import com.tencent.supersonic.headless.core.chat.query.llm.s2sql.LLMResp;

/**
 * LLMProxy encapsulates functions performed by LLMs so that multiple
 * orchestration frameworks (e.g. LangChain in Python, LangChain4j in Java)
 * can be used.
 */
public interface LLMProxy {

    boolean isSkip(QueryContext queryContext);

    LLMResp query2sql(LLMReq llmReq, Long dataSetId);

}
@@ -0,0 +1,75 @@
package com.tencent.supersonic.headless.core.chat.parser;

import com.tencent.supersonic.common.util.ContextUtils;
import com.tencent.supersonic.common.util.JsonUtil;
import com.tencent.supersonic.headless.core.config.LLMParserConfig;
import com.tencent.supersonic.headless.core.chat.parser.llm.OutputFormat;
import com.tencent.supersonic.headless.core.pojo.QueryContext;
import com.tencent.supersonic.headless.core.chat.query.llm.s2sql.LLMReq;
import com.tencent.supersonic.headless.core.chat.query.llm.s2sql.LLMResp;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections4.MapUtils;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.http.HttpEntity;
import org.springframework.http.HttpHeaders;
import org.springframework.http.HttpMethod;
import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity;
import org.springframework.stereotype.Component;
import org.springframework.web.client.RestTemplate;

import java.net.URL;
import java.util.ArrayList;

/**
 * PythonLLMProxy sends requests to a LangChain-based Python service.
 */
@Slf4j
@Component
public class PythonLLMProxy implements LLMProxy {

    private static final Logger keyPipelineLog = LoggerFactory.getLogger("keyPipeline");

    @Override
    public boolean isSkip(QueryContext queryContext) {
        LLMParserConfig llmParserConfig = ContextUtils.getBean(LLMParserConfig.class);
        if (StringUtils.isEmpty(llmParserConfig.getUrl())) {
            log.warn("llmParserUrl is empty, skip :{}", PythonLLMProxy.class.getName());
            return true;
        }
        return false;
    }

    public LLMResp query2sql(LLMReq llmReq, Long dataSetId) {
        long startTime = System.currentTimeMillis();
        log.info("requestLLM request, dataSetId:{}, llmReq:{}", dataSetId, llmReq);
        keyPipelineLog.info("dataSetId:{}, llmReq:{}", dataSetId, llmReq);
        try {
            LLMParserConfig llmParserConfig = ContextUtils.getBean(LLMParserConfig.class);

            URL url = new URL(new URL(llmParserConfig.getUrl()), llmParserConfig.getQueryToSqlPath());
            HttpHeaders headers = new HttpHeaders();
            headers.setContentType(MediaType.APPLICATION_JSON);
            HttpEntity<String> entity = new HttpEntity<>(JsonUtil.toString(llmReq), headers);
            RestTemplate restTemplate = ContextUtils.getBean(RestTemplate.class);
            ResponseEntity<LLMResp> responseEntity = restTemplate.exchange(url.toString(), HttpMethod.POST, entity,
                    LLMResp.class);

            LLMResp llmResp = responseEntity.getBody();
            log.info("requestLLM response, cost:{}, requestUrl:{} \n entity:{} \n body:{}",
                    System.currentTimeMillis() - startTime, url, entity, llmResp);
            keyPipelineLog.info("LLMResp:{}", llmResp);
            if (MapUtils.isEmpty(llmResp.getSqlRespMap())) {
                llmResp.setSqlRespMap(OutputFormat.buildSqlRespMap(new ArrayList<>(), llmResp.getSqlWeight()));
            }
            return llmResp;
        } catch (Exception e) {
            log.error("requestLLM error", e);
        }
        return null;
    }

}
@@ -0,0 +1,33 @@
package com.tencent.supersonic.headless.core.chat.parser;

import com.tencent.supersonic.auth.api.authentication.pojo.User;
import com.tencent.supersonic.headless.api.pojo.SemanticParseInfo;
import com.tencent.supersonic.headless.core.chat.query.SemanticQuery;
import com.tencent.supersonic.headless.core.pojo.ChatContext;
import com.tencent.supersonic.headless.core.pojo.QueryContext;
import lombok.extern.slf4j.Slf4j;

import java.util.List;

/**
 * QueryTypeParser resolves the query type as METRIC, TAG, or ID.
 */
@Slf4j
public class QueryTypeParser implements SemanticParser {

    @Override
    public void parse(QueryContext queryContext, ChatContext chatContext) {

        List<SemanticQuery> candidateQueries = queryContext.getCandidateQueries();
        User user = queryContext.getUser();

        for (SemanticQuery semanticQuery : candidateQueries) {
            // 1. init S2SQL
            semanticQuery.initS2Sql(queryContext.getSemanticSchema(), user);
            // 2. set queryType
            SemanticParseInfo parseInfo = semanticQuery.getParseInfo();
            parseInfo.setQueryType(queryContext.getQueryType(parseInfo.getDataSetId()));
        }
    }

}
@@ -0,0 +1,48 @@
package com.tencent.supersonic.headless.core.chat.parser;

import com.tencent.supersonic.common.util.ContextUtils;
import com.tencent.supersonic.headless.api.pojo.SemanticParseInfo;
import com.tencent.supersonic.headless.core.config.OptimizationConfig;
import com.tencent.supersonic.headless.core.pojo.QueryContext;
import com.tencent.supersonic.headless.core.chat.query.SemanticQuery;
import com.tencent.supersonic.headless.core.chat.query.llm.s2sql.LLMSqlQuery;
import lombok.extern.slf4j.Slf4j;

/**
 * This checker can be used by semantic parsers to check whether the query intent
 * has already been satisfied by the current candidate queries. If so, the current
 * parser can be skipped.
 */
@Slf4j
public class SatisfactionChecker {

    // check all the parse infos among the candidates
    public static boolean isSkip(QueryContext queryContext) {
        for (SemanticQuery query : queryContext.getCandidateQueries()) {
            if (query.getQueryMode().equals(LLMSqlQuery.QUERY_MODE)) {
                continue;
            }
            if (checkThreshold(queryContext.getQueryText(), query.getParseInfo())) {
                return true;
            }
        }
        return false;
    }

    private static boolean checkThreshold(String queryText, SemanticParseInfo semanticParseInfo) {
        int queryTextLength = queryText.replaceAll(" ", "").length();
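        // degree: parse score normalized by query length (whitespace removed);
        // higher means the parse covers more of the query text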
        double degree = semanticParseInfo.getScore() / queryTextLength;
        OptimizationConfig optimizationConfig = ContextUtils.getBean(OptimizationConfig.class);
        if (queryTextLength > optimizationConfig.getQueryTextLengthThreshold()) {
            if (degree < optimizationConfig.getLongTextThreshold()) {
                return false;
            }
        } else if (degree < optimizationConfig.getShortTextThreshold()) {
            return false;
        }
        log.info("queryMode:{}, degree:{}, parse info:{}",
                semanticParseInfo.getQueryMode(), degree, semanticParseInfo);
        return true;
    }

}
@@ -0,0 +1,14 @@
package com.tencent.supersonic.headless.core.chat.parser;

import com.tencent.supersonic.headless.core.pojo.ChatContext;
import com.tencent.supersonic.headless.core.pojo.QueryContext;

/**
 * A semantic parser understands user queries and extracts semantic information.
 * It can leverage either a rule-based or an LLM-based approach to identify query intent
 * and extract related semantic items from the query.
 */
public interface SemanticParser {

    void parse(QueryContext queryContext, ChatContext chatContext);
}
@@ -0,0 +1,9 @@
package com.tencent.supersonic.headless.core.chat.parser.llm;

import lombok.Data;

@Data
public class DataSetMatchResult {
    private Integer count = 0;
    private double maxSimilarity;
}
@@ -0,0 +1,11 @@
package com.tencent.supersonic.headless.core.chat.parser.llm;

import com.tencent.supersonic.headless.core.pojo.QueryContext;

import java.util.Set;

public interface DataSetResolver {

    Long resolve(QueryContext queryContext, Set<Long> restrictiveModels);

}
@@ -0,0 +1,130 @@
package com.tencent.supersonic.headless.core.chat.parser.llm;

import com.tencent.supersonic.headless.api.pojo.SchemaElementMatch;
import com.tencent.supersonic.headless.api.pojo.SchemaElementType;
import com.tencent.supersonic.headless.api.pojo.SchemaMapInfo;
import com.tencent.supersonic.headless.core.pojo.QueryContext;
import com.tencent.supersonic.headless.core.chat.query.SemanticQuery;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections.CollectionUtils;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;

@Slf4j
public class HeuristicDataSetResolver implements DataSetResolver {

    protected static Long selectDataSetBySchemaElementMatchScore(Map<Long, SemanticQuery> dataSetQueryModes,
            SchemaMapInfo schemaMap) {
        // dataSet count priority
        Long dataSetIdByDataSetCount = getDataSetIdByMatchDataSetScore(schemaMap);
        if (Objects.nonNull(dataSetIdByDataSetCount)) {
            log.info("selectDataSet by dataSet count:{}", dataSetIdByDataSetCount);
            return dataSetIdByDataSetCount;
        }

        Map<Long, DataSetMatchResult> dataSetTypeMap = getDataSetTypeMap(schemaMap);
        if (dataSetTypeMap.size() == 1) {
            Long dataSetSelect = new ArrayList<>(dataSetTypeMap.entrySet()).get(0).getKey();
            if (dataSetQueryModes.containsKey(dataSetSelect)) {
                log.info("selectDataSet with only one DataSet [{}]", dataSetSelect);
                return dataSetSelect;
            }
        } else {
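            // rank by the number of distinct matched element types; break ties by max
            // similarity (the difference is scaled by 100 and truncated to an int)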
            Entry<Long, DataSetMatchResult> maxDataSet = dataSetTypeMap.entrySet().stream()
                    .filter(entry -> dataSetQueryModes.containsKey(entry.getKey()))
                    .sorted((o1, o2) -> {
                        int difference = o2.getValue().getCount() - o1.getValue().getCount();
                        if (difference == 0) {
                            return (int) ((o2.getValue().getMaxSimilarity()
                                    - o1.getValue().getMaxSimilarity()) * 100);
                        }
                        return difference;
                    }).findFirst().orElse(null);
            if (maxDataSet != null) {
                log.info("selectDataSet with multiple DataSets [{}]", maxDataSet.getKey());
                return maxDataSet.getKey();
            }
        }
        return null;
    }

    private static Long getDataSetIdByMatchDataSetScore(SchemaMapInfo schemaMap) {
        Map<Long, List<SchemaElementMatch>> dataSetElementMatches = schemaMap.getDataSetElementMatches();
        // calculate dataSet match score: a matched element gets 1.0 point, an inherited element gets 0.5 point
        Map<Long, Double> dataSetIdToDataSetScore = new HashMap<>();
        if (Objects.nonNull(dataSetElementMatches)) {
            for (Entry<Long, List<SchemaElementMatch>> dataSetElementMatch : dataSetElementMatches.entrySet()) {
                Long dataSetId = dataSetElementMatch.getKey();
                List<Double> dataSetMatchesScore = dataSetElementMatch.getValue().stream()
                        .filter(elementMatch -> elementMatch.getSimilarity() >= 1)
                        .filter(elementMatch -> SchemaElementType.DATASET.equals(elementMatch.getElement().getType()))
                        .map(elementMatch -> elementMatch.isInherited() ? 0.5 : 1.0).collect(Collectors.toList());

                if (!CollectionUtils.isEmpty(dataSetMatchesScore)) {
                    // get the sum of dataSet match scores
                    double score = dataSetMatchesScore.stream().mapToDouble(Double::doubleValue).sum();
                    dataSetIdToDataSetScore.put(dataSetId, score);
                }
            }
            Entry<Long, Double> maxDataSetScore = dataSetIdToDataSetScore.entrySet().stream()
                    .max(Comparator.comparingDouble(Entry::getValue)).orElse(null);
            log.info("maxDataSetScore:{}, dataSetIdToDataSetScore:{}", maxDataSetScore, dataSetIdToDataSetScore);
            if (Objects.nonNull(maxDataSetScore)) {
                return maxDataSetScore.getKey();
            }
        }
        return null;
    }

    public static Map<Long, DataSetMatchResult> getDataSetTypeMap(SchemaMapInfo schemaMap) {
        Map<Long, DataSetMatchResult> dataSetCount = new HashMap<>();
        for (Entry<Long, List<SchemaElementMatch>> entry : schemaMap.getDataSetElementMatches().entrySet()) {
            List<SchemaElementMatch> schemaElementMatches = schemaMap.getMatchedElements(entry.getKey());
            if (schemaElementMatches != null && schemaElementMatches.size() > 0) {
                if (!dataSetCount.containsKey(entry.getKey())) {
                    dataSetCount.put(entry.getKey(), new DataSetMatchResult());
                }
                DataSetMatchResult dataSetMatchResult = dataSetCount.get(entry.getKey());
                Set<SchemaElementType> schemaElementTypes = new HashSet<>();
                schemaElementMatches.stream()
                        .forEach(schemaElementMatch -> schemaElementTypes.add(
                                schemaElementMatch.getElement().getType()));
                SchemaElementMatch schemaElementMatchMax = schemaElementMatches.stream()
                        .sorted((o1, o2) ->
                                ((int) ((o2.getSimilarity() - o1.getSimilarity()) * 100))
                        ).findFirst().orElse(null);
                if (schemaElementMatchMax != null) {
                    dataSetMatchResult.setMaxSimilarity(schemaElementMatchMax.getSimilarity());
                }
                dataSetMatchResult.setCount(schemaElementTypes.size());
            }
        }
        return dataSetCount;
    }

    public Long resolve(QueryContext queryContext, Set<Long> agentDataSetIds) {
        SchemaMapInfo mapInfo = queryContext.getMapInfo();
        Set<Long> matchedDataSets = mapInfo.getMatchedDataSetInfos();
        if (CollectionUtils.isNotEmpty(agentDataSetIds)) {
            matchedDataSets.retainAll(agentDataSetIds);
        }
        Map<Long, SemanticQuery> dataSetQueryModes = new HashMap<>();
        for (Long dataSetId : matchedDataSets) {
            dataSetQueryModes.put(dataSetId, null);
        }
        if (dataSetQueryModes.size() == 1) {
            return dataSetQueryModes.keySet().stream().findFirst().get();
        }
        return selectDataSetBySchemaElementMatchScore(dataSetQueryModes, mapInfo);
    }

}
@@ -0,0 +1,43 @@
package com.tencent.supersonic.headless.core.chat.parser.llm;

import lombok.extern.slf4j.Slf4j;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

@Slf4j
public class InputFormat {

    public static final String SEPERATOR = "\n\n";

    public static String format(String template, List<String> templateKey,
            List<Map<String, String>> toFormatList) {
        List<String> result = new ArrayList<>();

        for (Map<String, String> formatItem : toFormatList) {
            Map<String, String> retrievalMeta = subDict(formatItem, templateKey);
            result.add(format(template, retrievalMeta));
        }

        return String.join(SEPERATOR, result);
    }
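
    // illustrative example: replacing key "{question}" with "top 10 songs"
    // turns "Q: {question}" into "Q: top 10 songs"; map keys are replaced verbatim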
    public static String format(String input, Map<String, String> replacements) {
        for (Map.Entry<String, String> entry : replacements.entrySet()) {
            input = input.replace(entry.getKey(), entry.getValue());
        }
        return input;
    }

    private static Map<String, String> subDict(Map<String, String> dict, List<String> keys) {
        Map<String, String> subDict = new HashMap<>();
        for (String key : keys) {
            if (dict.containsKey(key)) {
                subDict.put(key, dict.get(key));
            }
        }
        return subDict;
    }
}
@@ -0,0 +1,240 @@
package com.tencent.supersonic.headless.core.chat.parser.llm;

import com.google.common.collect.Lists;
import com.tencent.supersonic.common.pojo.enums.DataFormatTypeEnum;
import com.tencent.supersonic.common.pojo.enums.QueryType;
import com.tencent.supersonic.common.pojo.enums.TimeDimensionEnum;
import com.tencent.supersonic.common.util.DateUtils;
import com.tencent.supersonic.headless.api.pojo.SchemaElement;
import com.tencent.supersonic.headless.api.pojo.SchemaElementMatch;
import com.tencent.supersonic.headless.api.pojo.SchemaElementType;
import com.tencent.supersonic.headless.api.pojo.SchemaItem;
import com.tencent.supersonic.headless.api.pojo.SemanticSchema;
import com.tencent.supersonic.headless.api.pojo.response.DataSetSchemaResp;
import com.tencent.supersonic.headless.core.chat.parser.SatisfactionChecker;
import com.tencent.supersonic.headless.core.chat.query.llm.s2sql.LLMReq;
import com.tencent.supersonic.headless.core.chat.query.llm.s2sql.LLMReq.ElementValue;
import com.tencent.supersonic.headless.core.chat.query.llm.s2sql.LLMResp;
import com.tencent.supersonic.headless.core.config.LLMParserConfig;
import com.tencent.supersonic.headless.core.config.OptimizationConfig;
import com.tencent.supersonic.headless.core.pojo.QueryContext;
import com.tencent.supersonic.headless.core.utils.ComponentFactory;
import com.tencent.supersonic.headless.core.utils.S2SqlDateHelper;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import org.springframework.util.CollectionUtils;

import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;

@Slf4j
@Service
public class LLMRequestService {
    @Autowired
    private LLMParserConfig llmParserConfig;
    @Autowired
    private OptimizationConfig optimizationConfig;

    public boolean isSkip(QueryContext queryCtx) {
        if (!queryCtx.isEnableLLM()) {
            log.info("llm not enabled, skip");
            return true;
        }
        if (ComponentFactory.getLLMProxy().isSkip(queryCtx)) {
            return true;
        }
        if (SatisfactionChecker.isSkip(queryCtx)) {
            log.info("skip {}, queryText:{}", LLMSqlParser.class, queryCtx.getQueryText());
            return true;
        }
        return false;
    }

    public Long getDataSetId(QueryContext queryCtx) {
        DataSetResolver dataSetResolver = ComponentFactory.getModelResolver();
        return dataSetResolver.resolve(queryCtx, queryCtx.getDataSetIds());
    }

    public LLMReq getLlmReq(QueryContext queryCtx, Long dataSetId,
            SemanticSchema semanticSchema, List<LLMReq.ElementValue> linkingValues) {
        Map<Long, String> dataSetIdToName = semanticSchema.getDataSetIdToName();
        String queryText = queryCtx.getQueryText();

        LLMReq llmReq = new LLMReq();
        llmReq.setQueryText(queryText);
        LLMReq.FilterCondition filterCondition = new LLMReq.FilterCondition();
        llmReq.setFilterCondition(filterCondition);

        LLMReq.LLMSchema llmSchema = new LLMReq.LLMSchema();
        llmSchema.setDataSetName(dataSetIdToName.get(dataSetId));
        llmSchema.setDomainName(dataSetIdToName.get(dataSetId));

        List<String> fieldNameList = getFieldNameList(queryCtx, dataSetId, llmParserConfig);

        String priorExts = getPriorExts(dataSetId, fieldNameList);
        llmReq.setPriorExts(priorExts);
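
        // the day-granularity time dimension (by its Chinese name) is always exposed to the LLM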
        fieldNameList.add(TimeDimensionEnum.DAY.getChName());
        llmSchema.setFieldNameList(fieldNameList);
        llmReq.setSchema(llmSchema);

        List<ElementValue> linking = new ArrayList<>();
        if (optimizationConfig.isUseLinkingValueSwitch()) {
            linking.addAll(linkingValues);
        }
        llmReq.setLinking(linking);

        String currentDate = S2SqlDateHelper.getReferenceDate(queryCtx, dataSetId);
        if (StringUtils.isEmpty(currentDate)) {
            currentDate = DateUtils.getBeforeDate(0);
        }
        llmReq.setCurrentDate(currentDate);
        llmReq.setSqlGenerationMode(optimizationConfig.getSqlGenerationMode().getName());
        return llmReq;
    }

    public LLMResp requestLLM(LLMReq llmReq, Long dataSetId) {
        return ComponentFactory.getLLMProxy().query2sql(llmReq, dataSetId);
    }

    protected List<String> getFieldNameList(QueryContext queryCtx, Long dataSetId,
            LLMParserConfig llmParserConfig) {

        Set<String> results = getTopNFieldNames(queryCtx, dataSetId, llmParserConfig);

        Set<String> fieldNameList = getMatchedFieldNames(queryCtx, dataSetId);

        results.addAll(fieldNameList);
        return new ArrayList<>(results);
    }

    private String getPriorExts(Long dataSetId, List<String> fieldNameList) {
        StringBuilder extraInfoSb = new StringBuilder();
        //todo
        List<DataSetSchemaResp> dataSetSchemaResps = Lists.newArrayList();
        if (!CollectionUtils.isEmpty(dataSetSchemaResps)) {
            DataSetSchemaResp dataSetSchemaResp = dataSetSchemaResps.get(0);
            Map<String, String> fieldNameToDataFormatType = dataSetSchemaResp.getMetrics()
                    .stream().filter(metricSchemaResp -> Objects.nonNull(metricSchemaResp.getDataFormatType()))
                    .flatMap(metricSchemaResp -> {
                        Set<Pair<String, String>> result = new HashSet<>();
                        String dataFormatType = metricSchemaResp.getDataFormatType();
                        result.add(Pair.of(metricSchemaResp.getName(), dataFormatType));
                        List<String> aliasList = SchemaItem.getAliasList(metricSchemaResp.getAlias());
                        if (!CollectionUtils.isEmpty(aliasList)) {
                            for (String alias : aliasList) {
                                result.add(Pair.of(alias, dataFormatType));
                            }
                        }
                        return result.stream();
                    }).collect(Collectors.toMap(Pair::getLeft, Pair::getRight, (k1, k2) -> k1));

            for (String fieldName : fieldNameList) {
                String dataFormatType = fieldNameToDataFormatType.get(fieldName);
                if (DataFormatTypeEnum.DECIMAL.getName().equalsIgnoreCase(dataFormatType)
                        || DataFormatTypeEnum.PERCENT.getName().equalsIgnoreCase(dataFormatType)) {
|
||||
String format = String.format("%s的计量单位是%s", fieldName, "小数; ");
|
||||
extraInfoSb.append(format);
|
||||
}
|
||||
}
|
||||
}
|
||||
return extraInfoSb.toString();
|
||||
}
|
||||
|
||||
protected List<ElementValue> getValueList(QueryContext queryCtx, Long dataSetId) {
|
||||
Map<Long, String> itemIdToName = getItemIdToName(queryCtx, dataSetId);
|
||||
List<SchemaElementMatch> matchedElements = queryCtx.getMapInfo().getMatchedElements(dataSetId);
|
||||
if (CollectionUtils.isEmpty(matchedElements)) {
|
||||
return new ArrayList<>();
|
||||
}
|
||||
Set<ElementValue> valueMatches = matchedElements
|
||||
.stream()
|
||||
.filter(elementMatch -> !elementMatch.isInherited())
|
||||
.filter(schemaElementMatch -> {
|
||||
SchemaElementType type = schemaElementMatch.getElement().getType();
|
||||
return SchemaElementType.VALUE.equals(type) || SchemaElementType.TAG_VALUE.equals(type)
|
||||
|| SchemaElementType.ID.equals(type);
|
||||
})
|
||||
.map(elementMatch -> {
|
||||
ElementValue elementValue = new ElementValue();
|
||||
elementValue.setFieldName(itemIdToName.get(elementMatch.getElement().getId()));
|
||||
elementValue.setFieldValue(elementMatch.getWord());
|
||||
return elementValue;
|
||||
}).collect(Collectors.toSet());
|
||||
return new ArrayList<>(valueMatches);
|
||||
}
|
||||
|
||||
protected Map<Long, String> getItemIdToName(QueryContext queryCtx, Long dataSetId) {
|
||||
SemanticSchema semanticSchema = queryCtx.getSemanticSchema();
|
||||
List<SchemaElement> elements = semanticSchema.getDimensions(dataSetId);
|
||||
if (QueryType.TAG.equals(queryCtx.getQueryType(dataSetId))) {
|
||||
elements = semanticSchema.getTags(dataSetId);
|
||||
}
|
||||
return elements.stream()
|
||||
.collect(Collectors.toMap(SchemaElement::getId, SchemaElement::getName, (value1, value2) -> value2));
|
||||
}
|
||||
|
||||
private Set<String> getTopNFieldNames(QueryContext queryCtx, Long dataSetId, LLMParserConfig llmParserConfig) {
|
||||
SemanticSchema semanticSchema = queryCtx.getSemanticSchema();
|
||||
Set<String> results = new HashSet<>();
|
||||
if (QueryType.TAG.equals(queryCtx.getQueryType(dataSetId))) {
|
||||
Set<String> tags = semanticSchema.getTags(dataSetId).stream()
|
||||
.sorted(Comparator.comparing(SchemaElement::getUseCnt).reversed())
|
||||
.limit(llmParserConfig.getDimensionTopN())
|
||||
.map(entry -> entry.getName())
|
||||
.collect(Collectors.toSet());
|
||||
results.addAll(tags);
|
||||
} else {
|
||||
Set<String> dimensions = semanticSchema.getDimensions(dataSetId).stream()
|
||||
.sorted(Comparator.comparing(SchemaElement::getUseCnt).reversed())
|
||||
.limit(llmParserConfig.getDimensionTopN())
|
||||
.map(entry -> entry.getName())
|
||||
.collect(Collectors.toSet());
|
||||
results.addAll(dimensions);
|
||||
Set<String> metrics = semanticSchema.getMetrics(dataSetId).stream()
|
||||
.sorted(Comparator.comparing(SchemaElement::getUseCnt).reversed())
|
||||
.limit(llmParserConfig.getMetricTopN())
|
||||
.map(entry -> entry.getName())
|
||||
.collect(Collectors.toSet());
|
||||
results.addAll(metrics);
|
||||
}
|
||||
return results;
|
||||
}
|
||||
|
||||
protected Set<String> getMatchedFieldNames(QueryContext queryCtx, Long dataSetId) {
|
||||
Map<Long, String> itemIdToName = getItemIdToName(queryCtx, dataSetId);
|
||||
List<SchemaElementMatch> matchedElements = queryCtx.getMapInfo().getMatchedElements(dataSetId);
|
||||
if (CollectionUtils.isEmpty(matchedElements)) {
|
||||
return new HashSet<>();
|
||||
}
|
||||
Set<String> fieldNameList = matchedElements.stream()
|
||||
.filter(schemaElementMatch -> {
|
||||
SchemaElementType elementType = schemaElementMatch.getElement().getType();
|
||||
return SchemaElementType.METRIC.equals(elementType)
|
||||
|| SchemaElementType.DIMENSION.equals(elementType)
|
||||
|| SchemaElementType.VALUE.equals(elementType)
|
||||
|| SchemaElementType.TAG.equals(elementType)
|
||||
|| SchemaElementType.TAG_VALUE.equals(elementType);
|
||||
})
|
||||
.map(schemaElementMatch -> {
|
||||
SchemaElement element = schemaElementMatch.getElement();
|
||||
SchemaElementType elementType = element.getType();
|
||||
if (SchemaElementType.VALUE.equals(elementType) || SchemaElementType.TAG_VALUE.equals(
|
||||
elementType)) {
|
||||
return itemIdToName.get(element.getId());
|
||||
}
|
||||
return schemaElementMatch.getWord();
|
||||
})
|
||||
.collect(Collectors.toSet());
|
||||
return fieldNameList;
|
||||
}
|
||||
}
|
||||
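A minimal sketch of how these methods chain together during parsing (hypothetical ids; `requestService` stands for this Spring-wired bean):

    Long dataSetId = requestService.getDataSetId(queryCtx);
    List<LLMReq.ElementValue> linkingValues = requestService.getValueList(queryCtx, dataSetId);
    LLMReq llmReq = requestService.getLlmReq(queryCtx, dataSetId,
            queryCtx.getSemanticSchema(), linkingValues);
    LLMResp llmResp = requestService.requestLLM(llmReq, dataSetId);

Note that the field list sent to the model is the union of the top-N most-used dimensions/metrics and the fields actually matched in the query text, plus the day dimension.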
@@ -0,0 +1,58 @@
package com.tencent.supersonic.headless.core.chat.parser.llm;

import com.tencent.supersonic.common.pojo.Constants;
import com.tencent.supersonic.common.util.jsqlparser.SqlEqualHelper;
import com.tencent.supersonic.headless.api.pojo.SemanticParseInfo;
import com.tencent.supersonic.headless.core.pojo.QueryContext;
import com.tencent.supersonic.headless.core.chat.query.QueryManager;
import com.tencent.supersonic.headless.core.chat.query.llm.LLMSemanticQuery;
import com.tencent.supersonic.headless.core.chat.query.llm.s2sql.LLMResp;
import com.tencent.supersonic.headless.core.chat.query.llm.s2sql.LLMSqlQuery;
import com.tencent.supersonic.headless.core.chat.query.llm.s2sql.LLMSqlResp;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections4.MapUtils;
import org.springframework.stereotype.Service;

import java.util.HashMap;
import java.util.Map;
import java.util.Objects;

@Slf4j
@Service
public class LLMResponseService {

    public SemanticParseInfo addParseInfo(QueryContext queryCtx, ParseResult parseResult, String s2SQL, Double weight) {
        if (Objects.isNull(weight)) {
            weight = 0D;
        }
        LLMSemanticQuery semanticQuery = QueryManager.createLLMQuery(LLMSqlQuery.QUERY_MODE);
        SemanticParseInfo parseInfo = semanticQuery.getParseInfo();
        parseInfo.setDataSet(queryCtx.getSemanticSchema().getDataSet(parseResult.getDataSetId()));
        parseInfo.getElementMatches().addAll(queryCtx.getMapInfo().getMatchedElements(parseInfo.getDataSetId()));

        Map<String, Object> properties = new HashMap<>();
        properties.put(Constants.CONTEXT, parseResult);
        properties.put("type", "internal");
        parseInfo.setProperties(properties);
        parseInfo.setScore(queryCtx.getQueryText().length() * (1 + weight));
        parseInfo.setQueryMode(semanticQuery.getQueryMode());
        parseInfo.getSqlInfo().setS2SQL(s2SQL);
        queryCtx.getCandidateQueries().add(semanticQuery);
        return parseInfo;
    }

    public Map<String, LLMSqlResp> getDeduplicationSqlResp(LLMResp llmResp) {
        if (MapUtils.isEmpty(llmResp.getSqlRespMap())) {
            return llmResp.getSqlRespMap();
        }
        Map<String, LLMSqlResp> result = new HashMap<>();
        for (Map.Entry<String, LLMSqlResp> entry : llmResp.getSqlRespMap().entrySet()) {
            String key = entry.getKey();
            if (result.keySet().stream().anyMatch(existKey -> SqlEqualHelper.equals(existKey, key))) {
                continue;
            }
            result.put(key, entry.getValue());
        }
        return result;
    }
}
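To illustrate getDeduplicationSqlResp, assuming SqlEqualHelper.equals treats formatting-level variants of the same statement as equal (an assumption about its semantics, not verified here), a response map with equivalent keys collapses to the first one seen:

    // hypothetical sqlRespMap keys
    // "SELECT a FROM t"  and  "select a  from t"  -> considered equal, second dropped
    // "SELECT b FROM t"                           -> kept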
@@ -0,0 +1,66 @@
package com.tencent.supersonic.headless.core.chat.parser.llm;

import com.tencent.supersonic.common.util.ContextUtils;
import com.tencent.supersonic.headless.api.pojo.SemanticSchema;
import com.tencent.supersonic.headless.core.chat.parser.SemanticParser;
import com.tencent.supersonic.headless.core.pojo.ChatContext;
import com.tencent.supersonic.headless.core.pojo.QueryContext;
import com.tencent.supersonic.headless.core.chat.query.llm.s2sql.LLMReq;
import com.tencent.supersonic.headless.core.chat.query.llm.s2sql.LLMResp;
import com.tencent.supersonic.headless.core.chat.query.llm.s2sql.LLMSqlResp;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections4.MapUtils;

import java.util.List;
import java.util.Map;
import java.util.Objects;

@Slf4j
public class LLMSqlParser implements SemanticParser {

    @Override
    public void parse(QueryContext queryCtx, ChatContext chatCtx) {
        LLMRequestService requestService = ContextUtils.getBean(LLMRequestService.class);
        //1. determine whether to skip this parser
        if (requestService.isSkip(queryCtx)) {
            return;
        }
        try {
            //2. get dataSetId from queryCtx and chatCtx
            Long dataSetId = requestService.getDataSetId(queryCtx);
            if (dataSetId == null) {
                return;
            }
            //3. construct a request, call the LLM API, and retrieve the results
            List<LLMReq.ElementValue> linkingValues = requestService.getValueList(queryCtx, dataSetId);
            SemanticSchema semanticSchema = queryCtx.getSemanticSchema();
            LLMReq llmReq = requestService.getLlmReq(queryCtx, dataSetId, semanticSchema, linkingValues);
            LLMResp llmResp = requestService.requestLLM(llmReq, dataSetId);

            if (Objects.isNull(llmResp)) {
                return;
            }
            //4. deduplicate the SQL result list and build parseInfo
            LLMResponseService responseService = ContextUtils.getBean(LLMResponseService.class);
            Map<String, LLMSqlResp> deduplicationSqlResp = responseService.getDeduplicationSqlResp(llmResp);
            ParseResult parseResult = ParseResult.builder()
                    .dataSetId(dataSetId)
                    .llmReq(llmReq)
                    .llmResp(llmResp)
                    .linkingValues(linkingValues)
                    .build();

            if (MapUtils.isEmpty(deduplicationSqlResp)) {
                responseService.addParseInfo(queryCtx, parseResult, llmResp.getSqlOutput(), 1D);
            } else {
                deduplicationSqlResp.forEach((sql, sqlResp) -> {
                    responseService.addParseInfo(queryCtx, parseResult, sql, sqlResp.getSqlWeight());
                });
            }
        } catch (Exception e) {
            log.error("parse error", e);
        }
    }

}
@@ -0,0 +1,89 @@
package com.tencent.supersonic.headless.core.chat.parser.llm;

import com.tencent.supersonic.common.util.JsonUtil;
import com.tencent.supersonic.headless.core.config.OptimizationConfig;
import com.tencent.supersonic.headless.core.chat.query.llm.s2sql.LLMReq;
import com.tencent.supersonic.headless.core.chat.query.llm.s2sql.LLMReq.SqlGenerationMode;
import com.tencent.supersonic.headless.core.chat.query.llm.s2sql.LLMResp;
import dev.langchain4j.data.message.AiMessage;
import dev.langchain4j.model.chat.ChatLanguageModel;
import dev.langchain4j.model.input.Prompt;
import dev.langchain4j.model.input.PromptTemplate;
import dev.langchain4j.model.output.Response;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.tuple.Pair;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.InitializingBean;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.stream.Collectors;

@Service
@Slf4j
public class OnePassSCSqlGeneration implements SqlGeneration, InitializingBean {

    private static final Logger keyPipelineLog = LoggerFactory.getLogger("keyPipeline");

    @Autowired
    private ChatLanguageModel chatLanguageModel;

    @Autowired
    private SqlExamplarLoader sqlExamplarLoader;

    @Autowired
    private OptimizationConfig optimizationConfig;

    @Autowired
    private SqlPromptGenerator sqlPromptGenerator;

    @Override
    public LLMResp generation(LLMReq llmReq, Long dataSetId) {
        //1. retrieve sqlExamples and build the example-list pool
        keyPipelineLog.info("dataSetId:{},llmReq:{}", dataSetId, llmReq);

        List<Map<String, String>> sqlExamples = sqlExamplarLoader.retrieverSqlExamples(llmReq.getQueryText(),
                optimizationConfig.getText2sqlExampleNum());

        List<List<Map<String, String>>> exampleListPool = sqlPromptGenerator.getExampleCombos(sqlExamples,
                optimizationConfig.getText2sqlFewShotsNum(), optimizationConfig.getText2sqlSelfConsistencyNum());

        //2. generate linking+SQL prompts from the examples and query the model in parallel
        List<String> linkingSqlPromptPool = sqlPromptGenerator.generatePromptPool(llmReq, exampleListPool, true);
        List<String> llmResults = new CopyOnWriteArrayList<>();
        linkingSqlPromptPool.parallelStream().forEach(linkingSqlPrompt -> {
            Prompt prompt = PromptTemplate.from(JsonUtil.toString(linkingSqlPrompt))
                    .apply(new HashMap<>());
            keyPipelineLog.info("request prompt:{}", prompt.toSystemMessage());
            Response<AiMessage> response = chatLanguageModel.generate(prompt.toSystemMessage());
            String result = response.content().text();
            llmResults.add(result);
            keyPipelineLog.info("model response:{}", result);
        });
        //3. format responses and vote for self-consistency
        List<String> schemaLinkingResults = llmResults.stream()
                .map(llmResult -> OutputFormat.getSchemaLinks(llmResult)).collect(Collectors.toList());
        List<String> candidateSortedList = OutputFormat.formatList(schemaLinkingResults);
        Pair<String, Map<String, Double>> linkingMap = OutputFormat.selfConsistencyVote(candidateSortedList);
        List<String> sqlList = llmResults.stream()
                .map(llmResult -> OutputFormat.getSql(llmResult)).collect(Collectors.toList());

        Pair<String, Map<String, Double>> sqlMapPair = OutputFormat.selfConsistencyVote(sqlList);
        keyPipelineLog.info("linkingMap:{} sqlMap:{}", linkingMap, sqlMapPair.getRight());

        LLMResp result = new LLMResp();
        result.setQuery(llmReq.getQueryText());
        result.setSqlRespMap(OutputFormat.buildSqlRespMap(sqlExamples, sqlMapPair.getRight()));
        return result;
    }

    @Override
    public void afterPropertiesSet() {
        SqlGenerationFactory.addSqlGenerationForFactory(SqlGenerationMode.ONE_PASS_AUTO_COT_SELF_CONSISTENCY, this);
    }
}
@@ -0,0 +1,74 @@
package com.tencent.supersonic.headless.core.chat.parser.llm;

import com.tencent.supersonic.common.util.JsonUtil;
import com.tencent.supersonic.headless.core.config.OptimizationConfig;
import com.tencent.supersonic.headless.core.chat.query.llm.s2sql.LLMReq;
import com.tencent.supersonic.headless.core.chat.query.llm.s2sql.LLMReq.SqlGenerationMode;
import com.tencent.supersonic.headless.core.chat.query.llm.s2sql.LLMResp;
import com.tencent.supersonic.headless.core.chat.query.llm.s2sql.LLMSqlResp;
import dev.langchain4j.data.message.AiMessage;
import dev.langchain4j.model.chat.ChatLanguageModel;
import dev.langchain4j.model.input.Prompt;
import dev.langchain4j.model.input.PromptTemplate;
import dev.langchain4j.model.output.Response;
import lombok.extern.slf4j.Slf4j;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.InitializingBean;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

@Service
@Slf4j
public class OnePassSqlGeneration implements SqlGeneration, InitializingBean {

    private static final Logger keyPipelineLog = LoggerFactory.getLogger("keyPipeline");

    @Autowired
    private ChatLanguageModel chatLanguageModel;

    @Autowired
    private SqlExamplarLoader sqlExampleLoader;

    @Autowired
    private OptimizationConfig optimizationConfig;

    @Autowired
    private SqlPromptGenerator sqlPromptGenerator;

    @Override
    public LLMResp generation(LLMReq llmReq, Long dataSetId) {
        //1. retrieve sqlExamples
        keyPipelineLog.info("dataSetId:{},llmReq:{}", dataSetId, llmReq);
        List<Map<String, String>> sqlExamples = sqlExampleLoader.retrieverSqlExamples(llmReq.getQueryText(),
                optimizationConfig.getText2sqlExampleNum());

        //2. generate the linking+SQL prompt from sqlExamples and query the model
        String promptStr = sqlPromptGenerator.generatorLinkingAndSqlPrompt(llmReq, sqlExamples);

        Prompt prompt = PromptTemplate.from(JsonUtil.toString(promptStr)).apply(new HashMap<>());
        keyPipelineLog.info("request prompt:{}", prompt.toSystemMessage());
        Response<AiMessage> response = chatLanguageModel.generate(prompt.toSystemMessage());
        String result = response.content().text();
        keyPipelineLog.info("model response:{}", result);
        //3. format the response
        String schemaLinkStr = OutputFormat.getSchemaLinks(result);
        String sql = OutputFormat.getSql(result);
        Map<String, LLMSqlResp> sqlRespMap = new HashMap<>();
        sqlRespMap.put(sql, LLMSqlResp.builder().sqlWeight(1D).fewShots(sqlExamples).build());
        keyPipelineLog.info("schemaLinkStr:{},sqlRespMap:{}", schemaLinkStr, sqlRespMap);

        LLMResp llmResp = new LLMResp();
        llmResp.setQuery(llmReq.getQueryText());
        llmResp.setSqlRespMap(sqlRespMap);
        return llmResp;
    }

    @Override
    public void afterPropertiesSet() {
        SqlGenerationFactory.addSqlGenerationForFactory(SqlGenerationMode.ONE_PASS_AUTO_COT, this);
    }
}
@@ -0,0 +1,123 @@
package com.tencent.supersonic.headless.core.chat.parser.llm;

import com.tencent.supersonic.headless.core.chat.query.llm.s2sql.LLMSqlResp;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.tuple.Pair;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

/**
 * Helpers for parsing schema links and SQL out of raw LLM output,
 * plus self-consistency voting utilities.
 */
@Slf4j
public class OutputFormat {

    public static String getSchemaLink(String schemaLink) {
        String result = "";
        try {
            result = schemaLink.trim();
            String pattern = "Schema_links:(.*)";
            Pattern regexPattern = Pattern.compile(pattern, Pattern.DOTALL);
            Matcher matcher = regexPattern.matcher(result);
            if (matcher.find()) {
                return matcher.group(1).trim();
            }
        } catch (Exception e) {
            log.error("getSchemaLink error", e);
        }
        return result;
    }

    public static String getSql(String sqlOutput) {
        String sql = "";
        try {
            sqlOutput = sqlOutput.trim();
            String pattern = "SQL:(.*)";
            Pattern regexPattern = Pattern.compile(pattern);
            Matcher matcher = regexPattern.matcher(sqlOutput);
            if (matcher.find()) {
                return matcher.group(1);
            }
        } catch (Exception e) {
            log.error("getSql error", e);
        }
        return sql;
    }

    public static String getSchemaLinks(String text) {
        String schemaLinks = "";
        try {
            text = text.trim();
            String pattern = "Schema_links:(\\[.*?\\])|Schema_links: (\\[.*?\\])";
            Pattern regexPattern = Pattern.compile(pattern);
            Matcher matcher = regexPattern.matcher(text);

            if (matcher.find()) {
                if (matcher.group(1) != null) {
                    schemaLinks = matcher.group(1);
                } else if (matcher.group(2) != null) {
                    schemaLinks = matcher.group(2);
                }
            }
        } catch (Exception e) {
            log.error("getSchemaLinks error", e);
        }

        return schemaLinks;
    }

    public static Pair<String, Map<String, Double>> selfConsistencyVote(List<String> inputList) {
        Map<String, Integer> inputCounts = new HashMap<>();
        for (String input : inputList) {
            inputCounts.put(input, inputCounts.getOrDefault(input, 0) + 1);
        }

        String inputMax = null;
        int maxCount = 0;
        int inputSize = inputList.size();
        Map<String, Double> votePercentage = new HashMap<>();
        for (Map.Entry<String, Integer> entry : inputCounts.entrySet()) {
            String input = entry.getKey();
            int count = entry.getValue();
            if (count > maxCount) {
                inputMax = input;
                maxCount = count;
            }
            double percentage = (double) count / inputSize;
            votePercentage.put(input, percentage);
        }
        return Pair.of(inputMax, votePercentage);
    }

    public static List<String> formatList(List<String> toFormatList) {
        List<String> results = new ArrayList<>();
        for (String toFormat : toFormatList) {
            List<String> items = new ArrayList<>();
            String[] split = toFormat.replace("[", "").replace("]", "").split(",");
            for (String item : split) {
                items.add(item.trim());
            }
            Collections.sort(items);
            String result = "[" + String.join(",", items) + "]";
            results.add(result);
        }
        return results;
    }

    public static Map<String, LLMSqlResp> buildSqlRespMap(List<Map<String, String>> sqlExamples,
            Map<String, Double> sqlMap) {
        return sqlMap.entrySet().stream()
                .collect(Collectors.toMap(
                        Map.Entry::getKey,
                        entry -> LLMSqlResp.builder().sqlWeight(entry.getValue()).fewShots(sqlExamples).build())
                );
    }

}
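A worked example of the two central helpers (hypothetical model outputs):

    OutputFormat.getSql("A: ...\nSQL:SELECT 1");
    // => "SELECT 1"

    OutputFormat.selfConsistencyVote(Arrays.asList("s1", "s1", "s2"));
    // => Pair.of("s1", {s1=0.667, s2=0.333})

The left element of the pair is the majority answer; the vote shares on the right later become sqlWeight values via buildSqlRespMap.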
@@ -0,0 +1,29 @@
package com.tencent.supersonic.headless.core.chat.parser.llm;

import com.tencent.supersonic.headless.api.pojo.request.QueryReq;
import com.tencent.supersonic.headless.core.chat.query.llm.s2sql.LLMReq;
import com.tencent.supersonic.headless.core.chat.query.llm.s2sql.LLMReq.ElementValue;
import com.tencent.supersonic.headless.core.chat.query.llm.s2sql.LLMResp;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;

import java.util.List;

@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
public class ParseResult {

    private Long dataSetId;

    private LLMReq llmReq;

    private LLMResp llmResp;

    private QueryReq request;

    private List<ElementValue> linkingValues;
}
@@ -0,0 +1,82 @@
package com.tencent.supersonic.headless.core.chat.parser.llm;

import com.fasterxml.jackson.core.type.TypeReference;
import com.tencent.supersonic.common.config.EmbeddingConfig;
import com.tencent.supersonic.common.util.ComponentFactory;
import com.tencent.supersonic.common.util.JsonUtil;
import com.tencent.supersonic.common.util.embedding.EmbeddingQuery;
import com.tencent.supersonic.common.util.embedding.Retrieval;
import com.tencent.supersonic.common.util.embedding.RetrieveQuery;
import com.tencent.supersonic.common.util.embedding.RetrieveQueryResult;
import com.tencent.supersonic.common.util.embedding.S2EmbeddingStore;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections4.CollectionUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.io.ClassPathResource;
import org.springframework.stereotype.Component;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.stream.Collectors;

@Slf4j
@Component
public class SqlExamplarLoader {

    private static final String EXAMPLE_JSON_FILE = "s2ql_examplar.json";

    private S2EmbeddingStore s2EmbeddingStore = ComponentFactory.getS2EmbeddingStore();

    private TypeReference<List<SqlExample>> valueTypeRef = new TypeReference<List<SqlExample>>() {
    };

    @Autowired
    private EmbeddingConfig embeddingConfig;

    public List<SqlExample> getSqlExamples() throws IOException {
        ClassPathResource resource = new ClassPathResource(EXAMPLE_JSON_FILE);
        try (InputStream inputStream = resource.getInputStream()) {
            return JsonUtil.INSTANCE.getObjectMapper().readValue(inputStream, valueTypeRef);
        }
    }

    public void addEmbeddingStore(List<SqlExample> sqlExamples, String collectionName) {
        List<EmbeddingQuery> queries = new ArrayList<>();
        for (int i = 0; i < sqlExamples.size(); i++) {
            SqlExample sqlExample = sqlExamples.get(i);
            String question = sqlExample.getQuestion();
            Map<String, Object> metaDataMap = JsonUtil.toMap(JsonUtil.toString(sqlExample), String.class, Object.class);
            EmbeddingQuery embeddingQuery = new EmbeddingQuery();
            embeddingQuery.setQueryId(String.valueOf(i));
            embeddingQuery.setQuery(question);
            embeddingQuery.setMetadata(metaDataMap);
            queries.add(embeddingQuery);
        }
        s2EmbeddingStore.addQuery(collectionName, queries);
    }

    public List<Map<String, String>> retrieverSqlExamples(String queryText, int maxResults) {
        String collectionName = embeddingConfig.getText2sqlCollectionName();
        RetrieveQuery retrieveQuery = RetrieveQuery.builder().queryTextsList(Collections.singletonList(queryText))
                .queryEmbeddings(null).build();

        List<RetrieveQueryResult> resultList = s2EmbeddingStore.retrieveQuery(collectionName, retrieveQuery,
                maxResults);
        List<Map<String, String>> result = new ArrayList<>();
        if (CollectionUtils.isEmpty(resultList)) {
            return result;
        }
        for (Retrieval retrieval : resultList.get(0).getRetrieval()) {
            if (Objects.nonNull(retrieval.getMetadata()) && !retrieval.getMetadata().isEmpty()) {
                Map<String, String> convertedMap = retrieval.getMetadata().entrySet().stream()
                        .collect(Collectors.toMap(Map.Entry::getKey, entry -> String.valueOf(entry.getValue())));
                result.add(convertedMap);
            }
        }
        return result;
    }
}
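A usage sketch (the collection name below is hypothetical; in practice it comes from EmbeddingConfig, and the loader itself is Spring-injected):

    List<SqlExample> examples = loader.getSqlExamples();        // reads s2ql_examplar.json
    loader.addEmbeddingStore(examples, "text2sql_collection");
    List<Map<String, String>> fewShots =
            loader.retrieverSqlExamples("近7天播放量", 5);       // top-5 similar questions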
@@ -0,0 +1,19 @@
package com.tencent.supersonic.headless.core.chat.parser.llm;

import lombok.Data;

@Data
public class SqlExample {

    private String question;

    private String questionAugmented;

    private String dbSchema;

    private String sql;

    private String generatedSchemaLinkingCoT;

    private String generatedSchemaLinkings;
}
@@ -0,0 +1,19 @@
package com.tencent.supersonic.headless.core.chat.parser.llm;

import com.tencent.supersonic.headless.core.chat.query.llm.s2sql.LLMReq;
import com.tencent.supersonic.headless.core.chat.query.llm.s2sql.LLMResp;

/**
 * SQL generation interface: generates SQL using a large language model.
 */
public interface SqlGeneration {

    /***
     * generate an LLMResp (sql, schemaLink, prompt, etc.) from the given LLMReq.
     * @param llmReq    the text-to-SQL request
     * @param dataSetId id of the data set the query targets
     * @return the generation result, including the candidate SQL map
     */
    LLMResp generation(LLMReq llmReq, Long dataSetId);

}
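Implementations are expected to register themselves once constructed; a minimal sketch of a custom strategy (class name is illustrative, the mode is one of the existing SqlGenerationMode values):

    @Service
    public class MySqlGeneration implements SqlGeneration, InitializingBean {
        @Override
        public LLMResp generation(LLMReq llmReq, Long dataSetId) {
            // call the model and assemble an LLMResp
            return new LLMResp();
        }

        @Override
        public void afterPropertiesSet() {
            SqlGenerationFactory.addSqlGenerationForFactory(SqlGenerationMode.ONE_PASS_AUTO_COT, this);
        }
    }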
@@ -0,0 +1,21 @@
package com.tencent.supersonic.headless.core.chat.parser.llm;

import com.tencent.supersonic.headless.core.chat.query.llm.s2sql.LLMReq.SqlGenerationMode;

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

public class SqlGenerationFactory {

    private static Map<SqlGenerationMode, SqlGeneration> sqlGenerationMap = new ConcurrentHashMap<>();

    public static SqlGeneration get(SqlGenerationMode strategyType) {
        return sqlGenerationMap.get(strategyType);
    }

    public static void addSqlGenerationForFactory(SqlGenerationMode strategy, SqlGeneration sqlGeneration) {
        sqlGenerationMap.put(strategy, sqlGeneration);
    }
}
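Callers resolve a strategy by the mode carried on the request, for example:

    SqlGeneration sqlGeneration = SqlGenerationFactory.get(SqlGenerationMode.ONE_PASS_AUTO_COT);
    LLMResp resp = sqlGeneration.generation(llmReq, dataSetId);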
@@ -0,0 +1,132 @@
package com.tencent.supersonic.headless.core.chat.parser.llm;

import com.tencent.supersonic.headless.core.chat.query.llm.s2sql.LLMReq;
import com.tencent.supersonic.headless.core.chat.query.llm.s2sql.LLMReq.ElementValue;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.tuple.Pair;
import org.springframework.stereotype.Component;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;

@Component
@Slf4j
public class SqlPromptGenerator {

    public String generatorLinkingAndSqlPrompt(LLMReq llmReq, List<Map<String, String>> exampleList) {
        String instruction =
                "# Find the schema_links for generating SQL queries for each question based on the database schema "
                        + "and Foreign keys. Then use the schema links to generate the "
                        + "SQL queries for each of the questions.";

        List<String> exampleKeys = Arrays.asList("questionAugmented", "dbSchema", "generatedSchemaLinkingCoT", "sql");
        String exampleTemplate = "dbSchema\nQ: questionAugmented\nA: generatedSchemaLinkingCoT\nSQL: sql";

        String exampleFormat = InputFormat.format(exampleTemplate, exampleKeys, exampleList);

        Pair<String, String> questionPrompt = transformQuestionPrompt(llmReq);
        String dbSchema = questionPrompt.getLeft();
        String questionAugmented = questionPrompt.getRight();

        String newCaseTemplate = "%s\nQ: %s\nA: Let’s think step by step. In the question \"%s\", we are asked:";
        String newCasePrompt = String.format(newCaseTemplate, dbSchema, questionAugmented, questionAugmented);

        return instruction + InputFormat.SEPERATOR + exampleFormat + InputFormat.SEPERATOR + newCasePrompt;
    }

    public String generateLinkingPrompt(LLMReq llmReq, List<Map<String, String>> exampleList) {
        String instruction = "# Find the schema_links for generating SQL queries for each question "
                + "based on the database schema and Foreign keys.";

        List<String> exampleKeys = Arrays.asList("questionAugmented", "dbSchema", "generatedSchemaLinkingCoT");
        String exampleTemplate = "dbSchema\nQ: questionAugmented\nA: generatedSchemaLinkingCoT";
        String exampleFormat = InputFormat.format(exampleTemplate, exampleKeys, exampleList);

        Pair<String, String> questionPrompt = transformQuestionPrompt(llmReq);
        String dbSchema = questionPrompt.getLeft();
        String questionAugmented = questionPrompt.getRight();
        String newCaseTemplate = "%s\nQ: %s\nA: Let’s think step by step. In the question \"%s\", we are asked:";
        String newCasePrompt = String.format(newCaseTemplate, dbSchema, questionAugmented, questionAugmented);

        return instruction + InputFormat.SEPERATOR + exampleFormat + InputFormat.SEPERATOR + newCasePrompt;
    }

    public String generateSqlPrompt(LLMReq llmReq, String schemaLinkStr, List<Map<String, String>> fewshotExampleList) {
        String instruction = "# Use the schema links to generate the SQL queries for each of the questions.";
        List<String> exampleKeys = Arrays.asList("questionAugmented", "dbSchema", "generatedSchemaLinkings", "sql");
        String exampleTemplate = "dbSchema\nQ: questionAugmented\n" + "Schema_links: generatedSchemaLinkings\n"
                + "SQL: sql";

        String schemaLinkingPrompt = InputFormat.format(exampleTemplate, exampleKeys, fewshotExampleList);
        Pair<String, String> questionPrompt = transformQuestionPrompt(llmReq);
        String dbSchema = questionPrompt.getLeft();
        String questionAugmented = questionPrompt.getRight();
        String newCaseTemplate = "%s\nQ: %s\nSchema_links: %s\nSQL: ";
        String newCasePrompt = String.format(newCaseTemplate, dbSchema, questionAugmented, schemaLinkStr);
        return instruction + InputFormat.SEPERATOR + schemaLinkingPrompt + InputFormat.SEPERATOR + newCasePrompt;
    }

    public List<String> generatePromptPool(LLMReq llmReq, List<List<Map<String, String>>> exampleListPool,
            boolean isSqlPrompt) {
        List<String> promptPool = new ArrayList<>();
        for (List<Map<String, String>> exampleList : exampleListPool) {
            String prompt;
            if (isSqlPrompt) {
                prompt = generatorLinkingAndSqlPrompt(llmReq, exampleList);
            } else {
                prompt = generateLinkingPrompt(llmReq, exampleList);
            }
            promptPool.add(prompt);
        }
        return promptPool;
    }

    public List<List<Map<String, String>>> getExampleCombos(List<Map<String, String>> exampleList, int numFewShots,
            int numSelfConsistency) {
        List<List<Map<String, String>>> results = new ArrayList<>();
        for (int i = 0; i < numSelfConsistency; i++) {
            List<Map<String, String>> shuffledList = new ArrayList<>(exampleList);
            Collections.shuffle(shuffledList);
            results.add(shuffledList.subList(0, numFewShots));
        }
        return results;
    }

    public Pair<String, String> transformQuestionPrompt(LLMReq llmReq) {
        String modelName = llmReq.getSchema().getDataSetName();
        List<String> fieldNameList = llmReq.getSchema().getFieldNameList();
        List<LLMReq.ElementValue> linking = llmReq.getLinking();
        String currentDate = llmReq.getCurrentDate();
        String priorExts = llmReq.getPriorExts();

        String dbSchema = "Table: " + modelName + ", Columns = " + fieldNameList + "\nForeign_keys: []";

        List<String> priorLinkingList = new ArrayList<>();
        for (ElementValue priorLinking : linking) {
            String fieldName = priorLinking.getFieldName();
            String fieldValue = priorLinking.getFieldValue();
            // "‘<value>‘是一个‘<field>‘" reads "<value> is a <field>"
            priorLinkingList.add("‘" + fieldValue + "‘是一个‘" + fieldName + "‘");
        }
        // "当前的日期是<date>" reads "the current date is <date>"
        String currentDateStr = "当前的日期是" + currentDate;
        String linkingListStr = String.join(",", priorLinkingList);
        // "(补充信息:...)" carries supplementary info, "(备注:...)" carries remarks
        String questionAugmented = String.format("%s (补充信息:%s 。 %s) (备注: %s)", llmReq.getQueryText(), linkingListStr,
                currentDateStr, priorExts);
        return Pair.of(dbSchema, questionAugmented);
    }

    public List<String> generateSqlPromptPool(LLMReq llmReq, List<String> schemaLinkStrPool,
            List<List<Map<String, String>>> fewshotExampleListPool) {
        List<String> sqlPromptPool = new ArrayList<>();
        for (int i = 0; i < schemaLinkStrPool.size(); i++) {
            String schemaLinkStr = schemaLinkStrPool.get(i);
            List<Map<String, String>> fewshotExampleList = fewshotExampleListPool.get(i);
            String sqlPrompt = generateSqlPrompt(llmReq, schemaLinkStr, fewshotExampleList);
            sqlPromptPool.add(sqlPrompt);
        }
        return sqlPromptPool;
    }

}
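For a concrete sense of transformQuestionPrompt, take a hypothetical request with dataSetName "歌曲库", fields [歌曲名, 播放量, 数据日期], one linked value 周杰伦 → 歌手名, currentDate "2024-01-01", empty priorExts, and queryText "播放量最高的歌曲". The two returned strings are:

    dbSchema:          Table: 歌曲库, Columns = [歌曲名, 播放量, 数据日期]\nForeign_keys: []
    questionAugmented: 播放量最高的歌曲 (补充信息:‘周杰伦‘是一个‘歌手名‘ 。 当前的日期是2024-01-01) (备注: )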
@@ -0,0 +1,91 @@
package com.tencent.supersonic.headless.core.chat.parser.llm;

import com.tencent.supersonic.common.util.JsonUtil;
import com.tencent.supersonic.headless.core.config.OptimizationConfig;
import com.tencent.supersonic.headless.core.chat.query.llm.s2sql.LLMReq;
import com.tencent.supersonic.headless.core.chat.query.llm.s2sql.LLMReq.SqlGenerationMode;
import com.tencent.supersonic.headless.core.chat.query.llm.s2sql.LLMResp;
import dev.langchain4j.data.message.AiMessage;
import dev.langchain4j.model.chat.ChatLanguageModel;
import dev.langchain4j.model.input.Prompt;
import dev.langchain4j.model.input.PromptTemplate;
import dev.langchain4j.model.output.Response;
import org.apache.commons.lang3.tuple.Pair;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.InitializingBean;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.CopyOnWriteArrayList;

@Service
public class TwoPassSCSqlGeneration implements SqlGeneration, InitializingBean {

    private static final Logger keyPipelineLog = LoggerFactory.getLogger("keyPipeline");

    @Autowired
    private ChatLanguageModel chatLanguageModel;

    @Autowired
    private SqlExamplarLoader sqlExamplarLoader;

    @Autowired
    private OptimizationConfig optimizationConfig;

    @Autowired
    private SqlPromptGenerator sqlPromptGenerator;

    @Override
    public LLMResp generation(LLMReq llmReq, Long dataSetId) {
        //1. retrieve sqlExamples and build the example-list pool
        keyPipelineLog.info("dataSetId:{},llmReq:{}", dataSetId, llmReq);
        List<Map<String, String>> sqlExamples = sqlExamplarLoader.retrieverSqlExamples(llmReq.getQueryText(),
                optimizationConfig.getText2sqlExampleNum());

        List<List<Map<String, String>>> exampleListPool = sqlPromptGenerator.getExampleCombos(sqlExamples,
                optimizationConfig.getText2sqlFewShotsNum(), optimizationConfig.getText2sqlSelfConsistencyNum());

        //2. generate linking prompts and query the model in parallel
        List<String> linkingPromptPool = sqlPromptGenerator.generatePromptPool(llmReq, exampleListPool, false);
        List<String> linkingResults = new CopyOnWriteArrayList<>();
        linkingPromptPool.parallelStream().forEach(linkingPrompt -> {
            Prompt prompt = PromptTemplate.from(JsonUtil.toString(linkingPrompt)).apply(new HashMap<>());
            keyPipelineLog.info("step one request prompt:{}", prompt.toSystemMessage());
            Response<AiMessage> linkingResult = chatLanguageModel.generate(prompt.toSystemMessage());
            String result = linkingResult.content().text();
            keyPipelineLog.info("step one model response:{}", result);
            linkingResults.add(OutputFormat.getSchemaLink(result));
        });
        List<String> sortedList = OutputFormat.formatList(linkingResults);
        Pair<String, Map<String, Double>> linkingMap = OutputFormat.selfConsistencyVote(sortedList);
        //3. generate SQL prompts and query the model in parallel
        List<String> sqlPromptPool = sqlPromptGenerator.generateSqlPromptPool(llmReq, sortedList, exampleListPool);
        List<String> sqlTaskPool = new CopyOnWriteArrayList<>();
        sqlPromptPool.parallelStream().forEach(sqlPrompt -> {
            Prompt prompt = PromptTemplate.from(JsonUtil.toString(sqlPrompt)).apply(new HashMap<>());
            keyPipelineLog.info("step two request prompt:{}", prompt.toSystemMessage());
            Response<AiMessage> sqlResult = chatLanguageModel.generate(prompt.toSystemMessage());
            String result = sqlResult.content().text();
            keyPipelineLog.info("step two model response:{}", result);
            sqlTaskPool.add(result);
        });
        //4. format responses and vote for self-consistency
        Pair<String, Map<String, Double>> sqlMapPair = OutputFormat.selfConsistencyVote(sqlTaskPool);
        keyPipelineLog.info("linkingMap:{} sqlMap:{}", linkingMap, sqlMapPair.getRight());

        LLMResp llmResp = new LLMResp();
        llmResp.setQuery(llmReq.getQueryText());
        llmResp.setSqlRespMap(OutputFormat.buildSqlRespMap(sqlExamples, sqlMapPair.getRight()));
        return llmResp;
    }

    @Override
    public void afterPropertiesSet() {
        SqlGenerationFactory.addSqlGenerationForFactory(SqlGenerationMode.TWO_PASS_AUTO_COT_SELF_CONSISTENCY, this);
    }
}
@@ -0,0 +1,75 @@
package com.tencent.supersonic.headless.core.chat.parser.llm;

import com.tencent.supersonic.common.util.JsonUtil;
import com.tencent.supersonic.headless.core.config.OptimizationConfig;
import com.tencent.supersonic.headless.core.chat.query.llm.s2sql.LLMReq;
import com.tencent.supersonic.headless.core.chat.query.llm.s2sql.LLMReq.SqlGenerationMode;
import com.tencent.supersonic.headless.core.chat.query.llm.s2sql.LLMResp;
import dev.langchain4j.data.message.AiMessage;
import dev.langchain4j.model.chat.ChatLanguageModel;
import dev.langchain4j.model.input.Prompt;
import dev.langchain4j.model.input.PromptTemplate;
import dev.langchain4j.model.output.Response;
import lombok.extern.slf4j.Slf4j;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.InitializingBean;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

@Service
@Slf4j
public class TwoPassSqlGeneration implements SqlGeneration, InitializingBean {

    private static final Logger keyPipelineLog = LoggerFactory.getLogger("keyPipeline");

    @Autowired
    private ChatLanguageModel chatLanguageModel;

    @Autowired
    private SqlExamplarLoader sqlExamplarLoader;

    @Autowired
    private OptimizationConfig optimizationConfig;

    @Autowired
    private SqlPromptGenerator sqlPromptGenerator;

    @Override
    public LLMResp generation(LLMReq llmReq, Long dataSetId) {
        keyPipelineLog.info("dataSetId:{},llmReq:{}", dataSetId, llmReq);
        List<Map<String, String>> sqlExamples = sqlExamplarLoader.retrieverSqlExamples(llmReq.getQueryText(),
                optimizationConfig.getText2sqlExampleNum());

        String linkingPromptStr = sqlPromptGenerator.generateLinkingPrompt(llmReq, sqlExamples);

        Prompt prompt = PromptTemplate.from(JsonUtil.toString(linkingPromptStr)).apply(new HashMap<>());
        keyPipelineLog.info("step one request prompt:{}", prompt.toSystemMessage());
        Response<AiMessage> response = chatLanguageModel.generate(prompt.toSystemMessage());
        keyPipelineLog.info("step one model response:{}", response.content().text());
        String schemaLinkStr = OutputFormat.getSchemaLink(response.content().text());
        String generateSqlPrompt = sqlPromptGenerator.generateSqlPrompt(llmReq, schemaLinkStr, sqlExamples);

        Prompt sqlPrompt = PromptTemplate.from(JsonUtil.toString(generateSqlPrompt)).apply(new HashMap<>());
        keyPipelineLog.info("step two request prompt:{}", sqlPrompt.toSystemMessage());
        Response<AiMessage> sqlResult = chatLanguageModel.generate(sqlPrompt.toSystemMessage());
        String result = sqlResult.content().text();
        keyPipelineLog.info("step two model response:{}", result);
        Map<String, Double> sqlMap = new HashMap<>();
        sqlMap.put(result, 1D);
        keyPipelineLog.info("schemaLinkStr:{},sqlMap:{}", schemaLinkStr, sqlMap);

        LLMResp llmResp = new LLMResp();
        llmResp.setQuery(llmReq.getQueryText());
        llmResp.setSqlRespMap(OutputFormat.buildSqlRespMap(sqlExamples, sqlMap));
        return llmResp;
    }

    @Override
    public void afterPropertiesSet() {
        SqlGenerationFactory.addSqlGenerationForFactory(SqlGenerationMode.TWO_PASS_AUTO_COT, this);
    }
}
@@ -0,0 +1,103 @@
package com.tencent.supersonic.headless.core.chat.parser.rule;

import com.tencent.supersonic.common.pojo.enums.AggregateTypeEnum;
import com.tencent.supersonic.headless.core.chat.parser.SemanticParser;
import com.tencent.supersonic.headless.core.pojo.ChatContext;
import com.tencent.supersonic.headless.core.pojo.QueryContext;
import com.tencent.supersonic.headless.core.chat.query.SemanticQuery;
import lombok.AllArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;

import java.util.AbstractMap;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import static com.tencent.supersonic.common.pojo.enums.AggregateTypeEnum.AVG;
import static com.tencent.supersonic.common.pojo.enums.AggregateTypeEnum.COUNT;
import static com.tencent.supersonic.common.pojo.enums.AggregateTypeEnum.DISTINCT;
import static com.tencent.supersonic.common.pojo.enums.AggregateTypeEnum.MAX;
import static com.tencent.supersonic.common.pojo.enums.AggregateTypeEnum.MIN;
import static com.tencent.supersonic.common.pojo.enums.AggregateTypeEnum.NONE;
import static com.tencent.supersonic.common.pojo.enums.AggregateTypeEnum.SUM;
import static com.tencent.supersonic.common.pojo.enums.AggregateTypeEnum.TOPN;

/**
 * AggregateTypeParser extracts the aggregation type specified in the user query
 * based on keyword matching.
 * Currently, it supports 7 types of aggregation: max, min, sum, avg, topN,
 * distinct count, count.
 */
@Slf4j
public class AggregateTypeParser implements SemanticParser {

    private static final Map<AggregateTypeEnum, Pattern> REGX_MAP = Stream.of(
            new AbstractMap.SimpleEntry<>(MAX, Pattern.compile("(?i)(最大值|最大|max|峰值|最高|最多)")),
            new AbstractMap.SimpleEntry<>(MIN, Pattern.compile("(?i)(最小值|最小|min|最低|最少)")),
            new AbstractMap.SimpleEntry<>(SUM, Pattern.compile("(?i)(汇总|总和|sum)")),
            new AbstractMap.SimpleEntry<>(AVG, Pattern.compile("(?i)(平均值|日均|平均|avg)")),
            new AbstractMap.SimpleEntry<>(TOPN, Pattern.compile("(?i)(top)")),
            new AbstractMap.SimpleEntry<>(DISTINCT, Pattern.compile("(?i)(uv)")),
            new AbstractMap.SimpleEntry<>(COUNT, Pattern.compile("(?i)(总数|pv)")),
            new AbstractMap.SimpleEntry<>(NONE, Pattern.compile("(?i)(明细)"))
    ).collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue, (k1, k2) -> k2));

    @Override
    public void parse(QueryContext queryContext, ChatContext chatContext) {
        String queryText = queryContext.getQueryText();
        AggregateConf aggregateConf = resolveAggregateConf(queryText);

        for (SemanticQuery semanticQuery : queryContext.getCandidateQueries()) {
            if (!AggregateTypeEnum.NONE.equals(semanticQuery.getParseInfo().getAggType())) {
                continue;
            }
            semanticQuery.getParseInfo().setAggType(aggregateConf.type);
            int detectWordLength = 0;
            if (StringUtils.isNotEmpty(aggregateConf.detectWord)) {
                detectWordLength = aggregateConf.detectWord.length();
            }
            semanticQuery.getParseInfo().setScore(semanticQuery.getParseInfo().getScore() + detectWordLength);
        }
    }

    public AggregateTypeEnum resolveAggregateType(String queryText) {
        AggregateConf aggregateConf = resolveAggregateConf(queryText);
        return aggregateConf.type;
    }

    private AggregateConf resolveAggregateConf(String queryText) {
        Map<AggregateTypeEnum, Integer> aggregateCount = new HashMap<>(REGX_MAP.size());
        Map<AggregateTypeEnum, String> aggregateWord = new HashMap<>(REGX_MAP.size());

        for (Map.Entry<AggregateTypeEnum, Pattern> entry : REGX_MAP.entrySet()) {
            Matcher matcher = entry.getValue().matcher(queryText);
            int count = 0;
            String detectWord = null;
            while (matcher.find()) {
                count++;
                detectWord = matcher.group();
            }
            if (count > 0) {
                aggregateCount.put(entry.getKey(), count);
                aggregateWord.put(entry.getKey(), detectWord);
            }
        }

        AggregateTypeEnum type = aggregateCount.entrySet().stream().max(Map.Entry.comparingByValue())
                .map(entry -> entry.getKey()).orElse(NONE);
        String detectWord = aggregateWord.get(type);
        return new AggregateConf(type, detectWord);
    }

    @AllArgsConstructor
    class AggregateConf {
        public AggregateTypeEnum type;
        public String detectWord;
    }

}
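Example resolutions, derived directly from the patterns above (the query texts are hypothetical):

    resolveAggregateType("播放量的最大值");  // => MAX  (detectWord "最大值")
    resolveAggregateType("日均播放量");      // => AVG  (detectWord "日均")
    resolveAggregateType("播放量明细");      // => NONE (detail query)

When multiple patterns match, the type with the most keyword hits wins.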
@@ -0,0 +1,124 @@
package com.tencent.supersonic.headless.core.chat.parser.rule;

import com.tencent.supersonic.headless.api.pojo.SchemaElementMatch;
import com.tencent.supersonic.headless.api.pojo.SchemaElementType;
import com.tencent.supersonic.headless.core.chat.parser.SemanticParser;
import com.tencent.supersonic.headless.core.pojo.ChatContext;
import com.tencent.supersonic.headless.core.pojo.QueryContext;
import com.tencent.supersonic.headless.core.chat.query.QueryManager;
import com.tencent.supersonic.headless.core.chat.query.SemanticQuery;
import com.tencent.supersonic.headless.core.chat.query.rule.RuleSemanticQuery;
import com.tencent.supersonic.headless.core.chat.query.rule.metric.MetricModelQuery;
import com.tencent.supersonic.headless.core.chat.query.rule.metric.MetricSemanticQuery;
import com.tencent.supersonic.headless.core.chat.query.rule.metric.MetricTagQuery;
import lombok.extern.slf4j.Slf4j;

import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;

/**
 * ContextInheritParser tries to inherit certain schema elements from the context
 * so that in multi-turn conversations users don't need to repeat keywords
 * in every turn.
 */
@Slf4j
public class ContextInheritParser implements SemanticParser {

    private static final Map<SchemaElementType, List<SchemaElementType>> MUTUAL_EXCLUSIVE_MAP = Stream.of(
            new AbstractMap.SimpleEntry<>(SchemaElementType.METRIC, Arrays.asList(SchemaElementType.METRIC)),
            new AbstractMap.SimpleEntry<>(
                    SchemaElementType.DIMENSION, Arrays.asList(SchemaElementType.DIMENSION, SchemaElementType.VALUE)),
            new AbstractMap.SimpleEntry<>(
                    SchemaElementType.VALUE, Arrays.asList(SchemaElementType.VALUE, SchemaElementType.DIMENSION)),
            new AbstractMap.SimpleEntry<>(SchemaElementType.ENTITY, Arrays.asList(SchemaElementType.ENTITY)),
            new AbstractMap.SimpleEntry<>(SchemaElementType.TAG, Arrays.asList(SchemaElementType.TAG)),
            new AbstractMap.SimpleEntry<>(SchemaElementType.DATASET, Arrays.asList(SchemaElementType.DATASET)),
            new AbstractMap.SimpleEntry<>(SchemaElementType.ID, Arrays.asList(SchemaElementType.ID))
    ).collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));

    @Override
    public void parse(QueryContext queryContext, ChatContext chatContext) {
        if (!shouldInherit(queryContext)) {
            return;
        }
        Long dataSetId = getMatchedDataSet(queryContext, chatContext);
        if (dataSetId == null) {
            return;
        }

        List<SchemaElementMatch> elementMatches = queryContext.getMapInfo().getMatchedElements(dataSetId);

        List<SchemaElementMatch> matchesToInherit = new ArrayList<>();
        for (SchemaElementMatch match : chatContext.getParseInfo().getElementMatches()) {
            SchemaElementType matchType = match.getElement().getType();
            // mutually exclusive element types should not be inherited
            RuleSemanticQuery ruleQuery = QueryManager.getRuleQuery(chatContext.getParseInfo().getQueryMode());
            if (!containsTypes(elementMatches, matchType, ruleQuery)) {
                match.setInherited(true);
                matchesToInherit.add(match);
            }
        }
        elementMatches.addAll(matchesToInherit);

        List<RuleSemanticQuery> queries = RuleSemanticQuery.resolve(elementMatches, queryContext);
        for (RuleSemanticQuery query : queries) {
            query.fillParseInfo(queryContext, chatContext);
            if (existSameQuery(query.getParseInfo().getDataSetId(), query.getQueryMode(), queryContext)) {
                continue;
            }
            queryContext.getCandidateQueries().add(query);
        }
    }

    private boolean existSameQuery(Long dataSetId, String queryMode, QueryContext queryContext) {
        for (SemanticQuery semanticQuery : queryContext.getCandidateQueries()) {
            if (semanticQuery.getQueryMode().equals(queryMode)
                    && semanticQuery.getParseInfo().getDataSetId().equals(dataSetId)) {
                return true;
            }
        }
        return false;
    }

    private boolean containsTypes(List<SchemaElementMatch> matches, SchemaElementType matchType,
            RuleSemanticQuery ruleQuery) {
        List<SchemaElementType> types = MUTUAL_EXCLUSIVE_MAP.get(matchType);

        return matches.stream().anyMatch(m -> {
            SchemaElementType type = m.getElement().getType();
            if (Objects.nonNull(ruleQuery) && ruleQuery instanceof MetricSemanticQuery
                    && !(ruleQuery instanceof MetricTagQuery)) {
                return types.contains(type);
            }
            return type.equals(matchType);
        });
    }

    protected boolean shouldInherit(QueryContext queryContext) {
        // inherit from context only when all candidate queries are MetricModelQuery
        List<SemanticQuery> metricModelQueries = queryContext.getCandidateQueries().stream()
                .filter(query -> query instanceof MetricModelQuery).collect(
                        Collectors.toList());
        return metricModelQueries.size() == queryContext.getCandidateQueries().size();
    }

    protected Long getMatchedDataSet(QueryContext queryContext, ChatContext chatContext) {
        Long dataSetId = chatContext.getParseInfo().getDataSetId();
        if (dataSetId == null) {
            return null;
        }
        Set<Long> queryDataSets = queryContext.getMapInfo().getMatchedDataSetInfos();
        if (queryDataSets.contains(dataSetId)) {
            return dataSetId;
        }
        // the context dataSet was not matched in the current query;
        // do not inherit across data sets
        return null;
    }

}
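A sketch of the intended behavior across two turns (hypothetical queries; exact matching depends on the configured schema):

    turn 1: "周杰伦的播放量"  -> VALUE(周杰伦) + METRIC(播放量)
    turn 2: "那点赞量呢"      -> METRIC(点赞量) only

In turn 2, VALUE(周杰伦) is inherited because no conflicting VALUE/DIMENSION was matched this turn, while METRIC(播放量) is not inherited since METRIC is mutually exclusive with the newly matched METRIC(点赞量).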
@@ -0,0 +1,41 @@
package com.tencent.supersonic.headless.core.chat.parser.rule;

import com.tencent.supersonic.headless.api.pojo.SchemaElementMatch;
import com.tencent.supersonic.headless.api.pojo.SchemaMapInfo;
import com.tencent.supersonic.headless.core.chat.parser.SemanticParser;
import com.tencent.supersonic.headless.core.pojo.ChatContext;
import com.tencent.supersonic.headless.core.pojo.QueryContext;
import com.tencent.supersonic.headless.core.chat.query.rule.RuleSemanticQuery;
import lombok.extern.slf4j.Slf4j;

import java.util.Arrays;
import java.util.List;

/**
 * RuleSqlParser resolves a specific SemanticQuery according to the co-appearance
 * of certain schema element types.
 */
@Slf4j
public class RuleSqlParser implements SemanticParser {

    private static List<SemanticParser> auxiliaryParsers = Arrays.asList(
            new ContextInheritParser(),
            new TimeRangeParser(),
            new AggregateTypeParser()
    );

    @Override
    public void parse(QueryContext queryContext, ChatContext chatContext) {
        SchemaMapInfo mapInfo = queryContext.getMapInfo();
        // iterate all schemaElementMatches to resolve the query mode
        for (Long dataSetId : mapInfo.getMatchedDataSetInfos()) {
            List<SchemaElementMatch> elementMatches = mapInfo.getMatchedElements(dataSetId);
            List<RuleSemanticQuery> queries = RuleSemanticQuery.resolve(elementMatches, queryContext);
            for (RuleSemanticQuery query : queries) {
                query.fillParseInfo(queryContext, chatContext);
                queryContext.getCandidateQueries().add(query);
            }
        }

        auxiliaryParsers.forEach(p -> p.parse(queryContext, chatContext));
    }
}
@@ -0,0 +1,210 @@
package com.tencent.supersonic.headless.core.chat.parser.rule;

import com.tencent.supersonic.common.pojo.Constants;
import com.tencent.supersonic.common.pojo.DateConf;
import com.tencent.supersonic.headless.core.chat.parser.SemanticParser;
import com.tencent.supersonic.headless.core.pojo.ChatContext;
import com.tencent.supersonic.headless.core.pojo.QueryContext;
import com.tencent.supersonic.headless.core.chat.query.QueryManager;
import com.tencent.supersonic.headless.core.chat.query.SemanticQuery;
import com.tencent.supersonic.headless.core.chat.query.rule.RuleSemanticQuery;
import com.xkzhangsan.time.nlp.TimeNLP;
import com.xkzhangsan.time.nlp.TimeNLPUtil;
import lombok.extern.slf4j.Slf4j;
import org.apache.logging.log4j.util.Strings;

import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.time.LocalDate;
import java.util.Date;
import java.util.List;
import java.util.Stack;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * TimeRangeParser extracts the time range specified in the user query
 * based on keyword matching.
 * Currently, it supports two kinds of expression:
 * 1. Recent unit: 近N天/周/月/年、过去N天/周/月/年
 * 2. Concrete date: 2023年11月15日、20231115
 */
@Slf4j
public class TimeRangeParser implements SemanticParser {

    private static final Pattern RECENT_PATTERN_CN = Pattern.compile(
            ".*(?<periodStr>(近|过去)((?<enNum>\\d+)|(?<zhNum>[一二三四五六七八九十百千万亿]+))个?(?<zhPeriod>[天周月年])).*");
    private static final Pattern DATE_PATTERN_NUMBER = Pattern.compile("(\\d{8})");
    private static final DateFormat DATE_FORMAT_NUMBER = new SimpleDateFormat("yyyyMMdd");
    private static final DateFormat DATE_FORMAT = new SimpleDateFormat("yyyy-MM-dd");

    @Override
    public void parse(QueryContext queryContext, ChatContext chatContext) {
        String queryText = queryContext.getQueryText();
        DateConf dateConf = parseRecent(queryText);
        if (dateConf == null) {
            dateConf = parseDateNumber(queryText);
        }
        if (dateConf == null) {
            dateConf = parseDateCN(queryText);
        }

        if (dateConf != null) {
            if (queryContext.getCandidateQueries().size() > 0) {
                for (SemanticQuery query : queryContext.getCandidateQueries()) {
                    query.getParseInfo().setDateInfo(dateConf);
                    query.getParseInfo().setScore(query.getParseInfo().getScore()
                            + dateConf.getDetectWord().length());
                }
            } else if (QueryManager.containsRuleQuery(chatContext.getParseInfo().getQueryMode())) {
                RuleSemanticQuery semanticQuery = QueryManager.createRuleQuery(
                        chatContext.getParseInfo().getQueryMode());
                // inherit parse info from context
                chatContext.getParseInfo().setDateInfo(dateConf);
                chatContext.getParseInfo().setScore(chatContext.getParseInfo().getScore()
                        + dateConf.getDetectWord().length());
                semanticQuery.setParseInfo(chatContext.getParseInfo());
                queryContext.getCandidateQueries().add(semanticQuery);
            }
        }
    }

    private DateConf parseDateCN(String queryText) {
        Date startDate = null;
        Date endDate;
        String detectWord = null;

        List<TimeNLP> times = TimeNLPUtil.parse(queryText);
        if (times.size() > 0) {
            startDate = times.get(0).getTime();
            detectWord = times.get(0).getTimeExpression();
        } else {
            return null;
        }

        if (times.size() > 1) {
            endDate = times.get(1).getTime();
            detectWord += "~" + times.get(1).getTimeExpression();
        } else {
            endDate = startDate;
        }

        return getDateConf(startDate, endDate, detectWord);
    }

    private DateConf parseDateNumber(String queryText) {
        String startDate;
        String endDate = null;
        String detectWord = null;

        Matcher dateMatcher = DATE_PATTERN_NUMBER.matcher(queryText);
        if (dateMatcher.find()) {
            startDate = dateMatcher.group();
            detectWord = startDate;
        } else {
            return null;
        }

        if (dateMatcher.find()) {
            endDate = dateMatcher.group();
            detectWord += "~" + endDate;
        }

        endDate = endDate != null ? endDate : startDate;

        try {
            return getDateConf(DATE_FORMAT_NUMBER.parse(startDate), DATE_FORMAT_NUMBER.parse(endDate), detectWord);
        } catch (ParseException e) {
            return null;
        }
    }

    private DateConf parseRecent(String queryText) {
        Matcher m = RECENT_PATTERN_CN.matcher(queryText);
        if (m.matches()) {
            int num = 0;
            String enNum = m.group("enNum");
            String zhNum = m.group("zhNum");
            if (enNum != null) {
                num = Integer.parseInt(enNum);
            } else if (zhNum != null) {
                num = zhNumParse(zhNum);
            }
            if (num > 0) {
                DateConf info = new DateConf();
                String zhPeriod = m.group("zhPeriod");
                int days;
                switch (zhPeriod) {
                    case "周":
                        days = 7;
                        info.setPeriod(Constants.WEEK);
                        break;
                    case "月":
                        days = 30;
                        info.setPeriod(Constants.MONTH);
                        break;
                    case "年":
                        days = 365;
                        info.setPeriod(Constants.YEAR);
                        break;
                    default:
                        days = 1;
                        info.setPeriod(Constants.DAY);
                }
                days = days * num;
                info.setDateMode(DateConf.DateMode.RECENT);
                String detectWord = "近" + num + zhPeriod;
                if (Strings.isNotEmpty(m.group("periodStr"))) {
                    detectWord = m.group("periodStr");
                }
                info.setDetectWord(detectWord);
                info.setStartDate(LocalDate.now().minusDays(days).toString());
                info.setEndDate(LocalDate.now().minusDays(1).toString());
                info.setUnit(num);

                return info;
            }
        }

        return null;
    }

    /**
     * Parses a Chinese numeral (e.g. "三百二十" -> 320) by pushing each digit
     * onto a stack and multiplying it by the unit character that follows.
     */
    private int zhNumParse(String zhNumStr) {
        Stack<Integer> stack = new Stack<>();
        String numStr = "一二三四五六七八九";
        String unitStr = "十百千万亿";

        String[] ssArr = zhNumStr.split("");
        for (String e : ssArr) {
            int numIndex = numStr.indexOf(e);
            int unitIndex = unitStr.indexOf(e);
            if (numIndex != -1) {
                stack.push(numIndex + 1);
            } else if (unitIndex != -1) {
                int unitNum = (int) Math.pow(10, unitIndex + 1);
                if (stack.isEmpty()) {
                    stack.push(unitNum);
                } else {
                    stack.push(stack.pop() * unitNum);
                }
            }
        }

        return stack.stream().mapToInt(s -> s).sum();
    }

    private DateConf getDateConf(Date startDate, Date endDate, String detectWord) {
        if (startDate == null || endDate == null) {
            return null;
        }

        DateConf info = new DateConf();
        info.setDateMode(DateConf.DateMode.BETWEEN);
        info.setStartDate(DATE_FORMAT.format(startDate));
        info.setEndDate(DATE_FORMAT.format(endDate));
        info.setDetectWord(detectWord);
        return info;
    }

}
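As a quick check of the "recent unit" branch, the following standalone sketch reuses the RECENT_PATTERN_CN regex verbatim; the surrounding class and output are illustrative only. For "近7天" it reproduces the window parseRecent computes: the seven days ending yesterday.

import java.time.LocalDate;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Standalone sketch of the "recent unit" matching; the pattern is copied from
// TimeRangeParser, everything else is hypothetical.
public class RecentPatternSketch {
    private static final Pattern RECENT_PATTERN_CN = Pattern.compile(
            ".*(?<periodStr>(近|过去)((?<enNum>\\d+)|(?<zhNum>[一二三四五六七八九十百千万亿]+))个?(?<zhPeriod>[天周月年])).*");

    public static void main(String[] args) {
        Matcher m = RECENT_PATTERN_CN.matcher("近7天的销量");
        if (m.matches()) {
            int num = Integer.parseInt(m.group("enNum"));
            // for "天" the window is the num days ending yesterday, as in parseRecent
            System.out.println("start=" + LocalDate.now().minusDays(num)
                    + " end=" + LocalDate.now().minusDays(1)
                    + " detectWord=" + m.group("periodStr")); // 近7天
        }
    }
}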
@@ -0,0 +1,87 @@
package com.tencent.supersonic.headless.core.chat.query;

import com.tencent.supersonic.common.pojo.Aggregator;
import com.tencent.supersonic.common.pojo.Filter;
import com.tencent.supersonic.common.pojo.Order;
import com.tencent.supersonic.common.pojo.enums.TimeDimensionEnum;
import com.tencent.supersonic.common.util.ContextUtils;
import com.tencent.supersonic.headless.api.pojo.SemanticParseInfo;
import com.tencent.supersonic.headless.api.pojo.SemanticSchema;
import com.tencent.supersonic.headless.api.pojo.request.QuerySqlReq;
import com.tencent.supersonic.headless.api.pojo.request.QueryStructReq;
import com.tencent.supersonic.headless.core.config.OptimizationConfig;
import com.tencent.supersonic.headless.core.utils.QueryReqBuilder;
import lombok.ToString;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections4.CollectionUtils;

import java.io.Serializable;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

@Slf4j
@ToString
public abstract class BaseSemanticQuery implements SemanticQuery, Serializable {

    protected SemanticParseInfo parseInfo = new SemanticParseInfo();

    @Override
    public SemanticParseInfo getParseInfo() {
        return parseInfo;
    }

    @Override
    public void setParseInfo(SemanticParseInfo parseInfo) {
        this.parseInfo = parseInfo;
    }

    protected QueryStructReq convertQueryStruct() {
        return QueryReqBuilder.buildStructReq(parseInfo);
    }

    protected void convertBizNameToName(SemanticSchema semanticSchema, QueryStructReq queryStructReq) {
        Map<String, String> bizNameToName = semanticSchema.getBizNameToName(queryStructReq.getDataSetId());
        bizNameToName.putAll(TimeDimensionEnum.getNameToNameMap());

        List<Order> orders = queryStructReq.getOrders();
        if (CollectionUtils.isNotEmpty(orders)) {
            for (Order order : orders) {
                order.setColumn(bizNameToName.get(order.getColumn()));
            }
        }
        List<Aggregator> aggregators = queryStructReq.getAggregators();
        if (CollectionUtils.isNotEmpty(aggregators)) {
            for (Aggregator aggregator : aggregators) {
                aggregator.setColumn(bizNameToName.get(aggregator.getColumn()));
            }
        }
        List<String> groups = queryStructReq.getGroups();
        if (CollectionUtils.isNotEmpty(groups)) {
            groups = groups.stream().map(bizNameToName::get).collect(Collectors.toList());
            queryStructReq.setGroups(groups);
        }
        List<Filter> dimensionFilters = queryStructReq.getDimensionFilters();
        if (CollectionUtils.isNotEmpty(dimensionFilters)) {
            dimensionFilters.forEach(filter -> filter.setName(bizNameToName.get(filter.getBizName())));
        }
        List<Filter> metricFilters = queryStructReq.getMetricFilters();
        if (CollectionUtils.isNotEmpty(metricFilters)) {
            metricFilters.forEach(filter -> filter.setName(bizNameToName.get(filter.getBizName())));
        }
    }

    protected void initS2SqlByStruct(SemanticSchema semanticSchema) {
        OptimizationConfig optimizationConfig = ContextUtils.getBean(OptimizationConfig.class);
        if (!optimizationConfig.isUseS2SqlSwitch()) {
            return;
        }
        QueryStructReq queryStructReq = convertQueryStruct();
        convertBizNameToName(semanticSchema, queryStructReq);
        QuerySqlReq querySQLReq = queryStructReq.convert();
        parseInfo.getSqlInfo().setS2SQL(querySQLReq.getSql());
        parseInfo.getSqlInfo().setCorrectS2SQL(querySQLReq.getSql());
    }

}
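The effect of convertBizNameToName is a plain map substitution applied to every column reference in the struct request. A minimal sketch with hypothetical data:

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

// Hypothetical sketch of the bizName -> display-name substitution that
// convertBizNameToName applies to orders, aggregators, groups and filters.
public class BizNameMappingSketch {
    public static void main(String[] args) {
        Map<String, String> bizNameToName = new HashMap<>();
        bizNameToName.put("pv", "page views");
        bizNameToName.put("sys_imp_date", "date");

        List<String> groups = List.of("pv", "sys_imp_date");
        List<String> renamed = groups.stream().map(bizNameToName::get).collect(Collectors.toList());
        System.out.println(renamed); // [page views, date]
    }
}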
@@ -0,0 +1,87 @@
package com.tencent.supersonic.headless.core.chat.query;

import com.tencent.supersonic.headless.core.chat.query.llm.LLMSemanticQuery;
import com.tencent.supersonic.headless.core.chat.query.rule.RuleSemanticQuery;
import com.tencent.supersonic.headless.core.chat.query.rule.metric.MetricSemanticQuery;
import com.tencent.supersonic.headless.core.chat.query.rule.tag.TagSemanticQuery;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.ConcurrentHashMap;

public class QueryManager {

    private static Map<String, RuleSemanticQuery> ruleQueryMap = new ConcurrentHashMap<>();
    private static Map<String, LLMSemanticQuery> llmQueryMap = new ConcurrentHashMap<>();

    public static void register(SemanticQuery query) {
        if (query instanceof RuleSemanticQuery) {
            ruleQueryMap.put(query.getQueryMode(), (RuleSemanticQuery) query);
        } else if (query instanceof LLMSemanticQuery) {
            llmQueryMap.put(query.getQueryMode(), (LLMSemanticQuery) query);
        }
    }

    public static SemanticQuery createQuery(String queryMode) {
        if (containsRuleQuery(queryMode)) {
            return createRuleQuery(queryMode);
        }
        return createLLMQuery(queryMode);
    }

    public static RuleSemanticQuery createRuleQuery(String queryMode) {
        RuleSemanticQuery semanticQuery = ruleQueryMap.get(queryMode);
        return (RuleSemanticQuery) getSemanticQuery(queryMode, semanticQuery);
    }

    public static LLMSemanticQuery createLLMQuery(String queryMode) {
        LLMSemanticQuery semanticQuery = llmQueryMap.get(queryMode);
        return (LLMSemanticQuery) getSemanticQuery(queryMode, semanticQuery);
    }

    private static SemanticQuery getSemanticQuery(String queryMode, SemanticQuery semanticQuery) {
        if (Objects.isNull(semanticQuery)) {
            throw new RuntimeException("unsupported queryMode: " + queryMode);
        }
        try {
            return semanticQuery.getClass().getDeclaredConstructor().newInstance();
        } catch (Exception e) {
            throw new RuntimeException("unsupported queryMode: " + queryMode);
        }
    }

    public static boolean containsRuleQuery(String queryMode) {
        if (queryMode == null) {
            return false;
        }
        return ruleQueryMap.containsKey(queryMode);
    }

    public static boolean isMetricQuery(String queryMode) {
        if (queryMode == null || !ruleQueryMap.containsKey(queryMode)) {
            return false;
        }
        return ruleQueryMap.get(queryMode) instanceof MetricSemanticQuery;
    }

    public static boolean isTagQuery(String queryMode) {
        if (queryMode == null || !ruleQueryMap.containsKey(queryMode)) {
            return false;
        }
        return ruleQueryMap.get(queryMode) instanceof TagSemanticQuery;
    }

    public static RuleSemanticQuery getRuleQuery(String queryMode) {
        if (queryMode == null) {
            return null;
        }
        return ruleQueryMap.get(queryMode);
    }

    public static List<RuleSemanticQuery> getRuleQueries() {
        return new ArrayList<>(ruleQueryMap.values());
    }

}
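Usage sketch for the registry above, assuming the @Component constructors have already run (Spring normally does this at startup); createQuery hands back a fresh instance per call via reflection:

import com.tencent.supersonic.headless.core.chat.query.QueryManager;
import com.tencent.supersonic.headless.core.chat.query.SemanticQuery;
import com.tencent.supersonic.headless.core.chat.query.rule.metric.MetricFilterQuery;

// Hypothetical usage: concrete queries self-register in their constructors,
// after which QueryManager produces fresh instances per request.
public class QueryManagerUsageSketch {
    public static void main(String[] args) {
        // outside Spring, one warm-up instantiation triggers registration
        new MetricFilterQuery();

        SemanticQuery query = QueryManager.createQuery(MetricFilterQuery.QUERY_MODE);
        System.out.println(query.getQueryMode()); // METRIC_FILTER
    }
}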
@@ -0,0 +1,23 @@
package com.tencent.supersonic.headless.core.chat.query;

import com.tencent.supersonic.auth.api.authentication.pojo.User;
import com.tencent.supersonic.headless.api.pojo.SemanticParseInfo;
import com.tencent.supersonic.headless.api.pojo.SemanticSchema;
import com.tencent.supersonic.headless.api.pojo.request.SemanticQueryReq;
import org.apache.calcite.sql.parser.SqlParseException;

/**
 * A semantic query executes a specific type of query based on the results of semantic parsing.
 */
public interface SemanticQuery {

    String getQueryMode();

    SemanticQueryReq buildSemanticQueryReq() throws SqlParseException;

    void initS2Sql(SemanticSchema semanticSchema, User user);

    SemanticParseInfo getParseInfo();

    void setParseInfo(SemanticParseInfo parseInfo);
}
@@ -0,0 +1,8 @@
package com.tencent.supersonic.headless.core.chat.query.llm;

import com.tencent.supersonic.headless.core.chat.query.BaseSemanticQuery;
import lombok.extern.slf4j.Slf4j;

@Slf4j
public abstract class LLMSemanticQuery extends BaseSemanticQuery {
}
@@ -0,0 +1,83 @@
package com.tencent.supersonic.headless.core.chat.query.llm.s2sql;

import com.fasterxml.jackson.annotation.JsonValue;
import lombok.Data;

import java.util.List;

@Data
public class LLMReq {

    private String queryText;

    private FilterCondition filterCondition;

    private LLMSchema schema;

    private List<ElementValue> linking;

    private String currentDate;

    private String priorExts;

    private String sqlGenerationMode;

    @Data
    public static class ElementValue {

        private String fieldName;

        private String fieldValue;
    }

    @Data
    public static class LLMSchema {

        private String domainName;

        private String dataSetName;

        private List<String> fieldNameList;
    }

    @Data
    public static class FilterCondition {

        private String tableName;
    }

    public enum SqlGenerationMode {

        ONE_PASS_AUTO_COT("1_pass_auto_cot"),

        ONE_PASS_AUTO_COT_SELF_CONSISTENCY("1_pass_auto_cot_self_consistency"),

        TWO_PASS_AUTO_COT("2_pass_auto_cot"),

        TWO_PASS_AUTO_COT_SELF_CONSISTENCY("2_pass_auto_cot_self_consistency");

        private String name;

        SqlGenerationMode(String name) {
            this.name = name;
        }

        @JsonValue
        public String getName() {
            return name;
        }

        public static SqlGenerationMode getMode(String name) {
            for (SqlGenerationMode sqlGenerationMode : SqlGenerationMode.values()) {
                if (sqlGenerationMode.name.equals(name)) {
                    return sqlGenerationMode;
                }
            }
            return null;
        }
    }
}
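A small sketch of the @JsonValue contract on SqlGenerationMode: Jackson serializes the enum as its wire name, and getMode is the reverse lookup. The wrapper class is hypothetical:

import com.fasterxml.jackson.databind.ObjectMapper;
import com.tencent.supersonic.headless.core.chat.query.llm.s2sql.LLMReq.SqlGenerationMode;

// Jackson honors @JsonValue for serialization; getMode maps the wire name back.
public class SqlGenerationModeSketch {
    public static void main(String[] args) throws Exception {
        ObjectMapper mapper = new ObjectMapper();
        System.out.println(mapper.writeValueAsString(SqlGenerationMode.ONE_PASS_AUTO_COT));
        // "1_pass_auto_cot"
        System.out.println(SqlGenerationMode.getMode("2_pass_auto_cot"));
        // TWO_PASS_AUTO_COT
    }
}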
@@ -0,0 +1,26 @@
package com.tencent.supersonic.headless.core.chat.query.llm.s2sql;

import lombok.Data;

import java.util.List;
import java.util.Map;

@Data
public class LLMResp {

    private String query;

    private String modelName;

    private String sqlOutput;

    private List<String> fields;

    private Map<String, LLMSqlResp> sqlRespMap;

    /**
     * Kept only for compatibility with the Python code; to be removed later.
     */
    private Map<String, Double> sqlWeight;

}
@@ -0,0 +1,40 @@
package com.tencent.supersonic.headless.core.chat.query.llm.s2sql;

import com.tencent.supersonic.auth.api.authentication.pojo.User;
import com.tencent.supersonic.headless.api.pojo.SemanticSchema;
import com.tencent.supersonic.headless.api.pojo.SqlInfo;
import com.tencent.supersonic.headless.api.pojo.request.SemanticQueryReq;
import com.tencent.supersonic.headless.core.chat.query.QueryManager;
import com.tencent.supersonic.headless.core.chat.query.llm.LLMSemanticQuery;
import com.tencent.supersonic.headless.core.utils.QueryReqBuilder;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Component;

@Slf4j
@Component
public class LLMSqlQuery extends LLMSemanticQuery {

    public static final String QUERY_MODE = "LLM_S2SQL";

    public LLMSqlQuery() {
        QueryManager.register(this);
    }

    @Override
    public String getQueryMode() {
        return QUERY_MODE;
    }

    @Override
    public SemanticQueryReq buildSemanticQueryReq() {
        String querySql = parseInfo.getSqlInfo().getCorrectS2SQL();
        return QueryReqBuilder.buildS2SQLReq(querySql, parseInfo.getDataSetId());
    }

    @Override
    public void initS2Sql(SemanticSchema semanticSchema, User user) {
        SqlInfo sqlInfo = parseInfo.getSqlInfo();
        sqlInfo.setCorrectS2SQL(sqlInfo.getS2SQL());
    }
}
@@ -0,0 +1,22 @@
package com.tencent.supersonic.headless.core.chat.query.llm.s2sql;

import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;

import java.util.List;
import java.util.Map;

@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
public class LLMSqlResp {

    private double sqlWeight;

    private List<Map<String, String>> fewShots;

}
@@ -0,0 +1,45 @@
package com.tencent.supersonic.headless.core.chat.query.rule;

import lombok.Data;

@Data
public class QueryMatchOption {

    private OptionType schemaElementOption;
    private RequireNumberType requireNumberType;
    private Integer requireNumber;

    public static QueryMatchOption build(OptionType schemaElementOption,
                                         RequireNumberType requireNumberType, Integer requireNumber) {
        QueryMatchOption queryMatchOption = new QueryMatchOption();
        queryMatchOption.requireNumber = requireNumber;
        queryMatchOption.requireNumberType = requireNumberType;
        queryMatchOption.schemaElementOption = schemaElementOption;
        return queryMatchOption;
    }

    public static QueryMatchOption optional() {
        QueryMatchOption queryMatchOption = new QueryMatchOption();
        queryMatchOption.setSchemaElementOption(OptionType.OPTIONAL);
        queryMatchOption.setRequireNumber(0);
        queryMatchOption.setRequireNumberType(RequireNumberType.AT_LEAST);
        return queryMatchOption;
    }

    public static QueryMatchOption unused() {
        QueryMatchOption queryMatchOption = new QueryMatchOption();
        queryMatchOption.setSchemaElementOption(OptionType.UNUSED);
        queryMatchOption.setRequireNumber(0);
        queryMatchOption.setRequireNumberType(RequireNumberType.EQUAL);
        return queryMatchOption;
    }

    public enum RequireNumberType {
        AT_MOST, AT_LEAST, EQUAL
    }

    public enum OptionType {
        REQUIRED, OPTIONAL, UNUSED
    }

}
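For orientation, the three option styles the matchers below rely on, in a hypothetical wrapper class:

import com.tencent.supersonic.headless.core.chat.query.rule.QueryMatchOption;
import static com.tencent.supersonic.headless.core.chat.query.rule.QueryMatchOption.OptionType.REQUIRED;
import static com.tencent.supersonic.headless.core.chat.query.rule.QueryMatchOption.RequireNumberType.AT_LEAST;

// Sketch of the factory styles consumed by QueryMatcher; getters come from @Data.
public class MatchOptionSketch {
    public static void main(String[] args) {
        QueryMatchOption required = QueryMatchOption.build(REQUIRED, AT_LEAST, 1);
        QueryMatchOption optional = QueryMatchOption.optional(); // OPTIONAL, AT_LEAST, 0
        QueryMatchOption unused = QueryMatchOption.unused();     // UNUSED, EQUAL, 0
        System.out.println(required.getSchemaElementOption()); // REQUIRED
        System.out.println(optional.getRequireNumberType());   // AT_LEAST
        System.out.println(unused.getSchemaElementOption());   // UNUSED
    }
}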
@@ -0,0 +1,106 @@
package com.tencent.supersonic.headless.core.chat.query.rule;

import com.tencent.supersonic.common.pojo.enums.AggregateTypeEnum;
import com.tencent.supersonic.headless.api.pojo.SchemaElementMatch;
import com.tencent.supersonic.headless.api.pojo.SchemaElementType;
import lombok.Data;
import lombok.ToString;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;

@Data
@ToString
public class QueryMatcher {

    private HashMap<SchemaElementType, QueryMatchOption> elementOptionMap = new HashMap<>();
    private boolean supportCompare;
    private boolean supportOrderBy;
    private List<AggregateTypeEnum> orderByTypes = Arrays.asList(AggregateTypeEnum.MAX, AggregateTypeEnum.MIN,
            AggregateTypeEnum.TOPN);

    public QueryMatcher() {
        for (SchemaElementType type : SchemaElementType.values()) {
            if (type.equals(SchemaElementType.DATASET)) {
                elementOptionMap.put(type, QueryMatchOption.optional());
            } else {
                elementOptionMap.put(type, QueryMatchOption.unused());
            }
        }
    }

    public QueryMatcher addOption(SchemaElementType type, QueryMatchOption.OptionType option,
                                  QueryMatchOption.RequireNumberType requireNumberType, Integer requireNumber) {
        elementOptionMap.put(type, QueryMatchOption.build(option, requireNumberType, requireNumber));
        return this;
    }

    /**
     * Matches schema elements against the current query according to the options.
     *
     * @param candidateElementMatches candidate schema element matches to examine
     * @return a list of all matched schema elements,
     *         or an empty list if no matches can be found
     */
    public List<SchemaElementMatch> match(List<SchemaElementMatch> candidateElementMatches) {
        List<SchemaElementMatch> elementMatches = new ArrayList<>();
        HashMap<SchemaElementType, Integer> schemaElementTypeCount = new HashMap<>();
        for (SchemaElementMatch schemaElementMatch : candidateElementMatches) {
            SchemaElementType schemaElementType = schemaElementMatch.getElement().getType();
            if (schemaElementTypeCount.containsKey(schemaElementType)) {
                schemaElementTypeCount.put(schemaElementType, schemaElementTypeCount.get(schemaElementType) + 1);
            } else {
                schemaElementTypeCount.put(schemaElementType, 1);
            }
        }

        // check if current query options are satisfied, return immediately if not
        for (Map.Entry<SchemaElementType, QueryMatchOption> e : elementOptionMap.entrySet()) {
            SchemaElementType elementType = e.getKey();
            QueryMatchOption elementOption = e.getValue();
            if (!isMatch(elementOption, getCount(schemaElementTypeCount, elementType))) {
                return new ArrayList<>();
            }
        }

        // add element match if its element type is not declared as unused
        for (SchemaElementMatch elementMatch : candidateElementMatches) {
            QueryMatchOption elementOption = elementOptionMap.get(elementMatch.getElement().getType());
            if (Objects.nonNull(elementOption) && !elementOption.getSchemaElementOption()
                    .equals(QueryMatchOption.OptionType.UNUSED)) {
                elementMatches.add(elementMatch);
            }
        }

        return elementMatches;
    }

    private int getCount(HashMap<SchemaElementType, Integer> schemaElementTypeCount,
                         SchemaElementType schemaElementType) {
        if (schemaElementTypeCount.containsKey(schemaElementType)) {
            return schemaElementTypeCount.get(schemaElementType);
        }
        return 0;
    }

    private boolean isMatch(QueryMatchOption queryMatchOption, int count) {
        // check if required but empty
        if (queryMatchOption.getSchemaElementOption().equals(QueryMatchOption.OptionType.REQUIRED) && count <= 0) {
            return false;
        }
        if (queryMatchOption.getRequireNumberType().equals(QueryMatchOption.RequireNumberType.AT_LEAST)
                && count < queryMatchOption.getRequireNumber()) {
            return false;
        }
        if (queryMatchOption.getRequireNumberType().equals(QueryMatchOption.RequireNumberType.AT_MOST)
                && count > queryMatchOption.getRequireNumber()) {
            return false;
        }
        return true;
    }
}
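The heart of match() is the per-type counting plus the isMatch thresholds. A self-contained restatement with plain ints (names hypothetical) showing how a missing REQUIRED type rejects the whole query:

import java.util.Map;

// Restates the REQUIRED/AT_LEAST checks of isMatch without the framework types.
public class MatchThresholdSketch {
    enum Need { REQUIRED, OPTIONAL }

    static boolean satisfied(Need need, int atLeast, int count) {
        if (need == Need.REQUIRED && count <= 0) {
            return false;
        }
        return count >= atLeast;
    }

    public static void main(String[] args) {
        Map<String, Integer> counts = Map.of("METRIC", 2, "DIMENSION", 0);
        System.out.println(satisfied(Need.REQUIRED, 1, counts.get("METRIC")));    // true
        System.out.println(satisfied(Need.REQUIRED, 1, counts.get("DIMENSION"))); // false -> whole query rejected
    }
}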
@@ -0,0 +1,242 @@
package com.tencent.supersonic.headless.core.chat.query.rule;

import com.tencent.supersonic.auth.api.authentication.pojo.User;
import com.tencent.supersonic.common.pojo.enums.FilterOperatorEnum;
import com.tencent.supersonic.headless.api.pojo.SchemaElement;
import com.tencent.supersonic.headless.api.pojo.SchemaElementMatch;
import com.tencent.supersonic.headless.api.pojo.SchemaElementType;
import com.tencent.supersonic.headless.api.pojo.SemanticParseInfo;
import com.tencent.supersonic.headless.api.pojo.SemanticSchema;
import com.tencent.supersonic.headless.api.pojo.request.QueryFilter;
import com.tencent.supersonic.headless.api.pojo.request.QueryMultiStructReq;
import com.tencent.supersonic.headless.api.pojo.request.QueryStructReq;
import com.tencent.supersonic.headless.api.pojo.request.SemanticQueryReq;
import com.tencent.supersonic.headless.core.chat.query.BaseSemanticQuery;
import com.tencent.supersonic.headless.core.chat.query.QueryManager;
import com.tencent.supersonic.headless.core.pojo.ChatContext;
import com.tencent.supersonic.headless.core.pojo.QueryContext;
import com.tencent.supersonic.headless.core.utils.QueryReqBuilder;
import lombok.ToString;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.stream.Collectors;

@Slf4j
@ToString
public abstract class RuleSemanticQuery extends BaseSemanticQuery {

    protected QueryMatcher queryMatcher = new QueryMatcher();

    public RuleSemanticQuery() {
        QueryManager.register(this);
    }

    public List<SchemaElementMatch> match(List<SchemaElementMatch> candidateElementMatches,
                                          QueryContext queryCtx) {
        return queryMatcher.match(candidateElementMatches);
    }

    @Override
    public void initS2Sql(SemanticSchema semanticSchema, User user) {
        initS2SqlByStruct(semanticSchema);
    }

    public void fillParseInfo(QueryContext queryContext, ChatContext chatContext) {
        parseInfo.setQueryMode(getQueryMode());
        SemanticSchema semanticSchema = queryContext.getSemanticSchema();

        fillSchemaElement(parseInfo, semanticSchema);
        fillScore(parseInfo);
        fillDateConf(parseInfo, chatContext.getParseInfo());
    }

    private void fillDateConf(SemanticParseInfo queryParseInfo, SemanticParseInfo chatParseInfo) {
        if (queryParseInfo.getDateInfo() != null || chatParseInfo.getDateInfo() == null) {
            return;
        }

        if ((QueryManager.isTagQuery(queryParseInfo.getQueryMode())
                && QueryManager.isTagQuery(chatParseInfo.getQueryMode()))
                || (QueryManager.isMetricQuery(queryParseInfo.getQueryMode())
                && QueryManager.isMetricQuery(chatParseInfo.getQueryMode()))) {
            // inherit date info from context
            queryParseInfo.setDateInfo(chatParseInfo.getDateInfo());
            queryParseInfo.getDateInfo().setInherited(true);
        }
    }

    private void fillScore(SemanticParseInfo parseInfo) {
        double totalScore = 0;

        Map<SchemaElementType, SchemaElementMatch> maxSimilarityMatch = new HashMap<>();
        for (SchemaElementMatch match : parseInfo.getElementMatches()) {
            SchemaElementType type = match.getElement().getType();
            if (!maxSimilarityMatch.containsKey(type)
                    || match.getSimilarity() > maxSimilarityMatch.get(type).getSimilarity()) {
                maxSimilarityMatch.put(type, match);
            }
        }

        for (SchemaElementMatch match : maxSimilarityMatch.values()) {
            totalScore += match.getDetectWord().length() * match.getSimilarity();
        }

        parseInfo.setScore(parseInfo.getScore() + totalScore);
    }

    private void fillSchemaElement(SemanticParseInfo parseInfo, SemanticSchema semanticSchema) {
        Set<Long> dataSetIds = parseInfo.getElementMatches().stream().map(SchemaElementMatch::getElement)
                .map(SchemaElement::getDataSet).collect(Collectors.toSet());
        parseInfo.setDataSet(semanticSchema.getDataSet(dataSetIds.iterator().next()));
        Map<Long, List<SchemaElementMatch>> dim2Values = new HashMap<>();
        Map<Long, List<SchemaElementMatch>> id2Values = new HashMap<>();
        Map<Long, List<SchemaElementMatch>> tag2Values = new HashMap<>();

        for (SchemaElementMatch schemaMatch : parseInfo.getElementMatches()) {
            SchemaElement element = schemaMatch.getElement();
            element.setOrder(1 - schemaMatch.getSimilarity());
            switch (element.getType()) {
                case ID:
                    addToValues(semanticSchema, SchemaElementType.ENTITY, id2Values, schemaMatch);
                    break;
                case TAG_VALUE:
                    addToValues(semanticSchema, SchemaElementType.TAG, tag2Values, schemaMatch);
                    break;
                case VALUE:
                    addToValues(semanticSchema, SchemaElementType.DIMENSION, dim2Values, schemaMatch);
                    break;
                case DIMENSION:
                    parseInfo.getDimensions().add(element);
                    break;
                case METRIC:
                    parseInfo.getMetrics().add(element);
                    break;
                case ENTITY:
                    parseInfo.setEntity(element);
                    break;
                default:
            }
        }

        addToFilters(id2Values, parseInfo, semanticSchema, SchemaElementType.ENTITY);
        addToFilters(dim2Values, parseInfo, semanticSchema, SchemaElementType.DIMENSION);
        addToFilters(tag2Values, parseInfo, semanticSchema, SchemaElementType.TAG);
    }

    private void addToFilters(Map<Long, List<SchemaElementMatch>> id2Values, SemanticParseInfo parseInfo,
                              SemanticSchema semanticSchema, SchemaElementType entity) {
        if (id2Values == null || id2Values.isEmpty()) {
            return;
        }
        for (Entry<Long, List<SchemaElementMatch>> entry : id2Values.entrySet()) {
            SchemaElement dimension = semanticSchema.getElement(entity, entry.getKey());

            if (entry.getValue().size() == 1) {
                SchemaElementMatch schemaMatch = entry.getValue().get(0);
                QueryFilter dimensionFilter = new QueryFilter();
                dimensionFilter.setValue(schemaMatch.getWord());
                dimensionFilter.setBizName(dimension.getBizName());
                dimensionFilter.setName(dimension.getName());
                dimensionFilter.setOperator(FilterOperatorEnum.EQUALS);
                dimensionFilter.setElementID(schemaMatch.getElement().getId());
                parseInfo.setEntity(semanticSchema.getElement(SchemaElementType.ENTITY, entry.getKey()));
                parseInfo.getDimensionFilters().add(dimensionFilter);
            } else {
                QueryFilter dimensionFilter = new QueryFilter();
                List<String> vals = new ArrayList<>();
                entry.getValue().stream().forEach(i -> vals.add(i.getWord()));
                dimensionFilter.setValue(vals);
                dimensionFilter.setBizName(dimension.getBizName());
                dimensionFilter.setName(dimension.getName());
                dimensionFilter.setOperator(FilterOperatorEnum.IN);
                dimensionFilter.setElementID(entry.getKey());
                parseInfo.getDimensionFilters().add(dimensionFilter);
            }
        }
    }

    private void addToValues(SemanticSchema semanticSchema, SchemaElementType entity,
                             Map<Long, List<SchemaElementMatch>> id2Values, SchemaElementMatch schemaMatch) {
        SchemaElement element = schemaMatch.getElement();
        SchemaElement entityElement = semanticSchema.getElement(entity, element.getId());
        if (entityElement != null) {
            if (id2Values.containsKey(element.getId())) {
                id2Values.get(element.getId()).add(schemaMatch);
            } else {
                id2Values.put(element.getId(), new ArrayList<>(Arrays.asList(schemaMatch)));
            }
        }
    }

    @Override
    public SemanticQueryReq buildSemanticQueryReq() {
        String queryMode = parseInfo.getQueryMode();

        if (parseInfo.getDataSetId() == null || StringUtils.isEmpty(queryMode)
                || !QueryManager.containsRuleQuery(queryMode)) {
            // reaching here indicates an upstream parsing error
            log.error("queryMode not found");
            throw new RuntimeException("queryMode not found");
        }

        QueryStructReq queryStructReq = convertQueryStruct();
        return queryStructReq.convert(true);
    }

    protected boolean isMultiStructQuery() {
        return false;
    }

    public SemanticQueryReq multiStructExecute() {
        String queryMode = parseInfo.getQueryMode();

        if (parseInfo.getDataSetId() == null || StringUtils.isEmpty(queryMode)
                || !QueryManager.containsRuleQuery(queryMode)) {
            // reaching here indicates an upstream parsing error
            log.error("queryMode not found");
            throw new RuntimeException("queryMode not found");
        }

        return convertQueryMultiStruct();
    }

    @Override
    public void setParseInfo(SemanticParseInfo parseInfo) {
        this.parseInfo = parseInfo;
    }

    public static List<RuleSemanticQuery> resolve(List<SchemaElementMatch> candidateElementMatches,
                                                  QueryContext queryContext) {
        List<RuleSemanticQuery> matchedQueries = new ArrayList<>();

        for (RuleSemanticQuery semanticQuery : QueryManager.getRuleQueries()) {
            List<SchemaElementMatch> matches = semanticQuery.match(candidateElementMatches, queryContext);

            if (matches.size() > 0) {
                RuleSemanticQuery query = QueryManager.createRuleQuery(semanticQuery.getQueryMode());
                query.getParseInfo().getElementMatches().addAll(matches);
                matchedQueries.add(query);
            }
        }

        return matchedQueries;
    }

    protected QueryStructReq convertQueryStruct() {
        return QueryReqBuilder.buildStructReq(parseInfo);
    }

    protected QueryMultiStructReq convertQueryMultiStruct() {
        return QueryReqBuilder.buildMultiStructReq(parseInfo);
    }

}
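resolve() follows a prototype pattern: each registered query decides whether it matches, and every hit yields a fresh instance from QueryManager. A generic, self-contained sketch of that shape (hypothetical names, Java 16+ for the record):

import java.util.ArrayList;
import java.util.List;
import java.util.function.Predicate;
import java.util.function.Supplier;

// Prototypes test the element matches; every hit produces a fresh query mode.
public class ResolveSketch {
    record Prototype(Predicate<List<String>> matcher, Supplier<String> factory) { }

    public static void main(String[] args) {
        List<Prototype> registered = List.of(
                new Prototype(els -> els.contains("METRIC"), () -> "METRIC_MODEL"),
                new Prototype(els -> els.contains("METRIC") && els.contains("DIMENSION"),
                        () -> "METRIC_GROUPBY"));

        List<String> matched = new ArrayList<>();
        List<String> elements = List.of("METRIC", "DIMENSION");
        for (Prototype p : registered) {
            if (p.matcher().test(elements)) {
                matched.add(p.factory().get());
            }
        }
        System.out.println(matched); // [METRIC_MODEL, METRIC_GROUPBY]
    }
}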
@@ -0,0 +1,89 @@
package com.tencent.supersonic.headless.core.chat.query.rule.metric;

import com.tencent.supersonic.common.pojo.Filter;
import com.tencent.supersonic.common.pojo.enums.FilterOperatorEnum;
import com.tencent.supersonic.common.pojo.enums.FilterType;
import com.tencent.supersonic.headless.api.pojo.request.QueryMultiStructReq;
import com.tencent.supersonic.headless.api.pojo.request.QueryStructReq;
import com.tencent.supersonic.headless.api.pojo.request.SemanticQueryReq;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Component;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

import static com.tencent.supersonic.headless.api.pojo.SchemaElementType.VALUE;
import static com.tencent.supersonic.headless.core.chat.query.rule.QueryMatchOption.OptionType.REQUIRED;
import static com.tencent.supersonic.headless.core.chat.query.rule.QueryMatchOption.RequireNumberType.AT_LEAST;

@Slf4j
@Component
public class MetricFilterQuery extends MetricSemanticQuery {

    public static final String QUERY_MODE = "METRIC_FILTER";

    public MetricFilterQuery() {
        super();
        queryMatcher.addOption(VALUE, REQUIRED, AT_LEAST, 1);
    }

    @Override
    public String getQueryMode() {
        return QUERY_MODE;
    }

    @Override
    public SemanticQueryReq buildSemanticQueryReq() {
        if (!isMultiStructQuery()) {
            return super.buildSemanticQueryReq();
        }
        return super.multiStructExecute();
    }

    @Override
    protected boolean isMultiStructQuery() {
        Set<String> filterBizName = new HashSet<>();
        parseInfo.getDimensionFilters().forEach(filter ->
                filterBizName.add(filter.getBizName()));
        return FilterType.UNION.equals(parseInfo.getFilterType()) && filterBizName.size() > 1;
    }

    @Override
    protected QueryStructReq convertQueryStruct() {
        QueryStructReq queryStructReq = super.convertQueryStruct();
        addDimension(queryStructReq, true);
        return queryStructReq;
    }

    @Override
    protected QueryMultiStructReq convertQueryMultiStruct() {
        QueryMultiStructReq queryMultiStructReq = super.convertQueryMultiStruct();
        for (QueryStructReq queryStructReq : queryMultiStructReq.getQueryStructReqs()) {
            addDimension(queryStructReq, false);
        }
        return queryMultiStructReq;
    }

    private void addDimension(QueryStructReq queryStructReq, boolean onlyOperateInFilter) {
        if (!queryStructReq.getDimensionFilters().isEmpty()) {
            List<String> dimensions = queryStructReq.getGroups();
            log.info("addDimension before [{}]", queryStructReq.getGroups());
            List<Filter> filters = new ArrayList<>(queryStructReq.getDimensionFilters());
            if (onlyOperateInFilter) {
                filters = filters.stream().filter(filter
                        -> filter.getOperator().equals(FilterOperatorEnum.IN)).collect(Collectors.toList());
            }
            filters.forEach(d -> {
                if (!dimensions.contains(d.getBizName())) {
                    dimensions.add(d.getBizName());
                }
            });
            queryStructReq.setGroups(dimensions);
            log.info("addDimension after [{}]", queryStructReq.getGroups());
        }
    }

}
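The isMultiStructQuery rule above reduces to: UNION filter type plus filters spanning more than one distinct bizName. A minimal sketch with hypothetical data:

import java.util.HashSet;
import java.util.List;
import java.util.Set;

// Restates the multi-struct trigger: UNION semantics across > 1 distinct bizName.
public class MultiStructRuleSketch {
    static boolean isMultiStruct(boolean unionFilterType, List<String> filterBizNames) {
        Set<String> distinct = new HashSet<>(filterBizNames);
        return unionFilterType && distinct.size() > 1;
    }

    public static void main(String[] args) {
        System.out.println(isMultiStruct(true, List.of("city", "channel"))); // true
        System.out.println(isMultiStruct(true, List.of("city", "city")));    // false
    }
}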
@@ -0,0 +1,27 @@
package com.tencent.supersonic.headless.core.chat.query.rule.metric;

import org.springframework.stereotype.Component;

import static com.tencent.supersonic.headless.api.pojo.SchemaElementType.DIMENSION;
import static com.tencent.supersonic.headless.api.pojo.SchemaElementType.VALUE;
import static com.tencent.supersonic.headless.core.chat.query.rule.QueryMatchOption.OptionType.OPTIONAL;
import static com.tencent.supersonic.headless.core.chat.query.rule.QueryMatchOption.OptionType.REQUIRED;
import static com.tencent.supersonic.headless.core.chat.query.rule.QueryMatchOption.RequireNumberType.AT_LEAST;

@Component
public class MetricGroupByQuery extends MetricSemanticQuery {

    public static final String QUERY_MODE = "METRIC_GROUPBY";

    public MetricGroupByQuery() {
        super();
        queryMatcher.addOption(DIMENSION, REQUIRED, AT_LEAST, 1);
        queryMatcher.addOption(VALUE, OPTIONAL, AT_LEAST, 0);
    }

    @Override
    public String getQueryMode() {
        return QUERY_MODE;
    }

}
@@ -0,0 +1,23 @@
package com.tencent.supersonic.headless.core.chat.query.rule.metric;

import com.tencent.supersonic.headless.api.pojo.SchemaElementType;
import org.springframework.stereotype.Component;

import static com.tencent.supersonic.headless.core.chat.query.rule.QueryMatchOption.OptionType.OPTIONAL;
import static com.tencent.supersonic.headless.core.chat.query.rule.QueryMatchOption.RequireNumberType.AT_MOST;

@Component
public class MetricModelQuery extends MetricSemanticQuery {

    public static final String QUERY_MODE = "METRIC_MODEL";

    public MetricModelQuery() {
        super();
        queryMatcher.addOption(SchemaElementType.DATASET, OPTIONAL, AT_MOST, 1);
    }

    @Override
    public String getQueryMode() {
        return QUERY_MODE;
    }

}
@@ -0,0 +1,64 @@
package com.tencent.supersonic.headless.core.chat.query.rule.metric;

import com.tencent.supersonic.common.pojo.DateConf;
import com.tencent.supersonic.common.pojo.enums.TimeMode;
import com.tencent.supersonic.headless.api.pojo.DataSetSchema;
import com.tencent.supersonic.headless.api.pojo.SchemaElementMatch;
import com.tencent.supersonic.headless.api.pojo.TimeDefaultConfig;
import com.tencent.supersonic.headless.core.pojo.ChatContext;
import com.tencent.supersonic.headless.core.pojo.QueryContext;
import com.tencent.supersonic.headless.core.chat.query.rule.RuleSemanticQuery;
import lombok.extern.slf4j.Slf4j;

import java.time.LocalDate;
import java.util.List;
import java.util.Objects;

import static com.tencent.supersonic.headless.api.pojo.SchemaElementType.METRIC;
import static com.tencent.supersonic.headless.core.chat.query.rule.QueryMatchOption.OptionType.REQUIRED;
import static com.tencent.supersonic.headless.core.chat.query.rule.QueryMatchOption.RequireNumberType.AT_LEAST;

@Slf4j
public abstract class MetricSemanticQuery extends RuleSemanticQuery {

    private static final Long METRIC_MAX_RESULTS = 365L;

    public MetricSemanticQuery() {
        super();
        queryMatcher.addOption(METRIC, REQUIRED, AT_LEAST, 1);
    }

    @Override
    public List<SchemaElementMatch> match(List<SchemaElementMatch> candidateElementMatches,
                                          QueryContext queryCtx) {
        return super.match(candidateElementMatches, queryCtx);
    }

    @Override
    public void fillParseInfo(QueryContext queryContext, ChatContext chatContext) {
        super.fillParseInfo(queryContext, chatContext);
        parseInfo.setLimit(METRIC_MAX_RESULTS);
        if (parseInfo.getDateInfo() == null) {
            DataSetSchema dataSetSchema =
                    queryContext.getSemanticSchema().getDataSetSchemaMap().get(parseInfo.getDataSetId());
            TimeDefaultConfig timeDefaultConfig = dataSetSchema.getMetricTypeTimeDefaultConfig();
            DateConf dateInfo = new DateConf();
            if (Objects.nonNull(timeDefaultConfig) && Objects.nonNull(timeDefaultConfig.getUnit())) {
                int unit = timeDefaultConfig.getUnit();
                String startDate = LocalDate.now().plusDays(-unit).toString();
                String endDate = startDate;
                if (TimeMode.LAST.equals(timeDefaultConfig.getTimeMode())) {
                    dateInfo.setDateMode(DateConf.DateMode.BETWEEN);
                } else if (TimeMode.RECENT.equals(timeDefaultConfig.getTimeMode())) {
                    dateInfo.setDateMode(DateConf.DateMode.RECENT);
                    endDate = LocalDate.now().plusDays(-1).toString();
                }
                dateInfo.setUnit(unit);
                dateInfo.setPeriod(timeDefaultConfig.getPeriod());
                dateInfo.setStartDate(startDate);
                dateInfo.setEndDate(endDate);
            }
            parseInfo.setDateInfo(dateInfo);
        }
    }
}
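The default date window in fillParseInfo differs by TimeMode. A standalone sketch for a configured unit of 7, mirroring the arithmetic above (class name and output illustrative):

import java.time.LocalDate;

// TimeMode.LAST pins a single day N days back; TimeMode.RECENT spans N days
// ending yesterday, matching the branches in fillParseInfo.
public class DefaultDateWindowSketch {
    public static void main(String[] args) {
        int unit = 7;
        String startDate = LocalDate.now().plusDays(-unit).toString();
        System.out.println("LAST:   " + startDate + " ~ " + startDate);
        System.out.println("RECENT: " + startDate + " ~ " + LocalDate.now().plusDays(-1));
    }
}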
@@ -0,0 +1,90 @@
package com.tencent.supersonic.headless.core.chat.query.rule.metric;

import com.tencent.supersonic.common.pojo.Filter;
import com.tencent.supersonic.common.pojo.enums.FilterOperatorEnum;
import com.tencent.supersonic.common.pojo.enums.FilterType;
import com.tencent.supersonic.headless.api.pojo.request.QueryMultiStructReq;
import com.tencent.supersonic.headless.api.pojo.request.QueryStructReq;
import com.tencent.supersonic.headless.api.pojo.request.SemanticQueryReq;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Component;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

import static com.tencent.supersonic.headless.api.pojo.SchemaElementType.ENTITY;
import static com.tencent.supersonic.headless.api.pojo.SchemaElementType.ID;
import static com.tencent.supersonic.headless.core.chat.query.rule.QueryMatchOption.OptionType.REQUIRED;
import static com.tencent.supersonic.headless.core.chat.query.rule.QueryMatchOption.RequireNumberType.AT_LEAST;

@Slf4j
@Component
public class MetricTagQuery extends MetricSemanticQuery {

    public static final String QUERY_MODE = "METRIC_TAG";

    public MetricTagQuery() {
        super();
        queryMatcher.addOption(ID, REQUIRED, AT_LEAST, 1)
                .addOption(ENTITY, REQUIRED, AT_LEAST, 1);
    }

    @Override
    public String getQueryMode() {
        return QUERY_MODE;
    }

    @Override
    public SemanticQueryReq buildSemanticQueryReq() {
        if (!isMultiStructQuery()) {
            return super.buildSemanticQueryReq();
        }
        return super.multiStructExecute();
    }

    @Override
    protected boolean isMultiStructQuery() {
        Set<String> filterBizName = new HashSet<>();
        parseInfo.getDimensionFilters().stream()
                .filter(filter -> filter.getElementID() != null)
                .forEach(filter -> filterBizName.add(filter.getBizName()));
        return FilterType.UNION.equals(parseInfo.getFilterType()) && filterBizName.size() > 1;
    }

    @Override
    protected QueryStructReq convertQueryStruct() {
        QueryStructReq queryStructReq = super.convertQueryStruct();
        addDimension(queryStructReq, true);
        return queryStructReq;
    }

    @Override
    protected QueryMultiStructReq convertQueryMultiStruct() {
        QueryMultiStructReq queryMultiStructReq = super.convertQueryMultiStruct();
        for (QueryStructReq queryStructReq : queryMultiStructReq.getQueryStructReqs()) {
            addDimension(queryStructReq, false);
        }
        return queryMultiStructReq;
    }

    private void addDimension(QueryStructReq queryStructReq, boolean onlyOperateInFilter) {
        if (!queryStructReq.getDimensionFilters().isEmpty()) {
            List<String> dimensions = queryStructReq.getGroups();
            log.info("addDimension before [{}]", queryStructReq.getGroups());
            List<Filter> filters = new ArrayList<>(queryStructReq.getDimensionFilters());
            if (onlyOperateInFilter) {
                filters = filters.stream().filter(filter
                        -> filter.getOperator().equals(FilterOperatorEnum.IN)).collect(Collectors.toList());
            }
            filters.forEach(d -> {
                if (!dimensions.contains(d.getBizName())) {
                    dimensions.add(d.getBizName());
                }
            });
            queryStructReq.setGroups(dimensions);
            log.info("addDimension after [{}]", queryStructReq.getGroups());
        }
    }

}
@@ -0,0 +1,64 @@
package com.tencent.supersonic.headless.core.chat.query.rule.metric;

import com.tencent.supersonic.common.pojo.Order;
import com.tencent.supersonic.common.pojo.enums.AggregateTypeEnum;
import com.tencent.supersonic.headless.api.pojo.SchemaElement;
import com.tencent.supersonic.headless.api.pojo.SchemaElementMatch;
import com.tencent.supersonic.headless.core.pojo.ChatContext;
import com.tencent.supersonic.headless.core.pojo.QueryContext;
import org.springframework.stereotype.Component;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static com.tencent.supersonic.common.pojo.Constants.DESC_UPPER;
import static com.tencent.supersonic.headless.api.pojo.SchemaElementType.DIMENSION;
import static com.tencent.supersonic.headless.api.pojo.SchemaElementType.VALUE;
import static com.tencent.supersonic.headless.core.chat.query.rule.QueryMatchOption.OptionType.OPTIONAL;
import static com.tencent.supersonic.headless.core.chat.query.rule.QueryMatchOption.OptionType.REQUIRED;
import static com.tencent.supersonic.headless.core.chat.query.rule.QueryMatchOption.RequireNumberType.AT_LEAST;

@Component
public class MetricTopNQuery extends MetricSemanticQuery {

    public static final String QUERY_MODE = "METRIC_ORDERBY";
    private static final Long ORDERBY_MAX_RESULTS = 3L;
    private static final Pattern INTENT_PATTERN = Pattern.compile("(.*)(最大|最高|最多)(.*)");

    public MetricTopNQuery() {
        super();
        queryMatcher.addOption(DIMENSION, REQUIRED, AT_LEAST, 1);
        queryMatcher.addOption(VALUE, OPTIONAL, AT_LEAST, 0);
        queryMatcher.setSupportOrderBy(true);
    }

    @Override
    public List<SchemaElementMatch> match(List<SchemaElementMatch> candidateElementMatches,
                                          QueryContext queryCtx) {
        Matcher matcher = INTENT_PATTERN.matcher(queryCtx.getQueryText());
        if (matcher.matches()) {
            return super.match(candidateElementMatches, queryCtx);
        }
        return new ArrayList<>();
    }

    @Override
    public String getQueryMode() {
        return QUERY_MODE;
    }

    @Override
    public void fillParseInfo(QueryContext queryContext, ChatContext chatContext) {
        super.fillParseInfo(queryContext, chatContext);

        parseInfo.setLimit(ORDERBY_MAX_RESULTS);
        parseInfo.setScore(parseInfo.getScore() + 2.0);
        parseInfo.setAggType(AggregateTypeEnum.SUM);

        SchemaElement metric = parseInfo.getMetrics().iterator().next();
        parseInfo.getOrders().add(new Order(metric.getBizName(), DESC_UPPER));
    }

}
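The INTENT_PATTERN gate means MetricTopNQuery only competes on superlative phrasings. A standalone sketch reusing the pattern verbatim (wrapper class hypothetical):

import java.util.regex.Pattern;

// The pattern is copied from MetricTopNQuery; only the wrapper is illustrative.
public class TopNIntentSketch {
    private static final Pattern INTENT_PATTERN = Pattern.compile("(.*)(最大|最高|最多)(.*)");

    public static void main(String[] args) {
        System.out.println(INTENT_PATTERN.matcher("销量最高的城市").matches()); // true
        System.out.println(INTENT_PATTERN.matcher("各城市销量").matches());     // false
    }
}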
@@ -0,0 +1,26 @@
package com.tencent.supersonic.headless.core.chat.query.rule.tag;

import org.springframework.stereotype.Component;

import static com.tencent.supersonic.headless.api.pojo.SchemaElementType.TAG;
import static com.tencent.supersonic.headless.core.chat.query.rule.QueryMatchOption.OptionType.REQUIRED;
import static com.tencent.supersonic.headless.core.chat.query.rule.QueryMatchOption.RequireNumberType.AT_LEAST;
import static com.tencent.supersonic.headless.api.pojo.SchemaElementType.ID;

@Component
public class TagDetailQuery extends TagSemanticQuery {

    public static final String QUERY_MODE = "TAG_DETAIL";

    public TagDetailQuery() {
        super();
        queryMatcher.addOption(TAG, REQUIRED, AT_LEAST, 1)
                .addOption(ID, REQUIRED, AT_LEAST, 1);
    }

    @Override
    public String getQueryMode() {
        return QUERY_MODE;
    }

}
|
||||
package com.tencent.supersonic.headless.core.chat.query.rule.tag;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import static com.tencent.supersonic.headless.api.pojo.SchemaElementType.TAG;
|
||||
import static com.tencent.supersonic.headless.api.pojo.SchemaElementType.TAG_VALUE;
|
||||
import static com.tencent.supersonic.headless.core.chat.query.rule.QueryMatchOption.OptionType.OPTIONAL;
|
||||
import static com.tencent.supersonic.headless.core.chat.query.rule.QueryMatchOption.OptionType.REQUIRED;
|
||||
import static com.tencent.supersonic.headless.core.chat.query.rule.QueryMatchOption.RequireNumberType.AT_LEAST;
|
||||
|
||||
@Slf4j
|
||||
@Component
|
||||
public class TagFilterQuery extends TagListQuery {
|
||||
|
||||
public static final String QUERY_MODE = "TAG_LIST_FILTER";
|
||||
|
||||
public TagFilterQuery() {
|
||||
super();
|
||||
queryMatcher.addOption(TAG, OPTIONAL, AT_LEAST, 0);
|
||||
queryMatcher.addOption(TAG_VALUE, REQUIRED, AT_LEAST, 1);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getQueryMode() {
|
||||
return QUERY_MODE;
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,24 @@
package com.tencent.supersonic.headless.core.chat.query.rule.tag;

import org.springframework.stereotype.Component;

import static com.tencent.supersonic.headless.core.chat.query.rule.QueryMatchOption.OptionType.REQUIRED;
import static com.tencent.supersonic.headless.core.chat.query.rule.QueryMatchOption.RequireNumberType.AT_LEAST;
import static com.tencent.supersonic.headless.api.pojo.SchemaElementType.ID;

@Component
public class TagIdQuery extends TagListQuery {

    public static final String QUERY_MODE = "TAG_ID";

    public TagIdQuery() {
        super();
        queryMatcher.addOption(ID, REQUIRED, AT_LEAST, 1);
    }

    @Override
    public String getQueryMode() {
        return QUERY_MODE;
    }

}
@@ -0,0 +1,60 @@
package com.tencent.supersonic.headless.core.chat.query.rule.tag;

import com.tencent.supersonic.common.pojo.Constants;
import com.tencent.supersonic.common.pojo.Order;
import com.tencent.supersonic.headless.api.pojo.DataSetSchema;
import com.tencent.supersonic.headless.api.pojo.SchemaElement;
import com.tencent.supersonic.headless.api.pojo.SchemaElementType;
import com.tencent.supersonic.headless.api.pojo.SemanticParseInfo;
import com.tencent.supersonic.headless.api.pojo.TagTypeDefaultConfig;
import com.tencent.supersonic.headless.core.pojo.ChatContext;
import com.tencent.supersonic.headless.core.pojo.QueryContext;
import org.apache.commons.collections.CollectionUtils;

import java.util.LinkedHashSet;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;

public abstract class TagListQuery extends TagSemanticQuery {

    @Override
    public void fillParseInfo(QueryContext queryContext, ChatContext chatContext) {
        super.fillParseInfo(queryContext, chatContext);
        this.addEntityDetailAndOrderByMetric(queryContext, parseInfo);
    }

    private void addEntityDetailAndOrderByMetric(QueryContext queryContext, SemanticParseInfo parseInfo) {
        Long dataSetId = parseInfo.getDataSetId();
        if (Objects.nonNull(dataSetId) && dataSetId > 0L) {
            DataSetSchema dataSetSchema = queryContext.getSemanticSchema().getDataSetSchemaMap().get(dataSetId);
            if (dataSetSchema != null && Objects.nonNull(dataSetSchema.getEntity())) {
                Set<SchemaElement> dimensions = new LinkedHashSet<>();
                Set<SchemaElement> metrics = new LinkedHashSet<>();
                Set<Order> orders = new LinkedHashSet<>();
                TagTypeDefaultConfig tagTypeDefaultConfig = dataSetSchema.getTagTypeDefaultConfig();
                if (tagTypeDefaultConfig != null && tagTypeDefaultConfig.getDefaultDisplayInfo() != null) {
                    if (CollectionUtils.isNotEmpty(tagTypeDefaultConfig.getDefaultDisplayInfo().getMetricIds())) {
                        metrics = tagTypeDefaultConfig.getDefaultDisplayInfo().getMetricIds()
                                .stream().map(id -> {
                                    SchemaElement metric = dataSetSchema.getElement(SchemaElementType.METRIC, id);
                                    if (metric != null) {
                                        orders.add(new Order(metric.getBizName(), Constants.DESC_UPPER));
                                    }
                                    return metric;
                                }).filter(Objects::nonNull).collect(Collectors.toSet());
                    }
                    if (CollectionUtils.isNotEmpty(tagTypeDefaultConfig.getDefaultDisplayInfo().getDimensionIds())) {
                        dimensions = tagTypeDefaultConfig.getDefaultDisplayInfo().getDimensionIds().stream()
                                .map(id -> dataSetSchema.getElement(SchemaElementType.DIMENSION, id))
                                .filter(Objects::nonNull).collect(Collectors.toSet());
                    }
                }
                parseInfo.setDimensions(dimensions);
                parseInfo.setMetrics(metrics);
                parseInfo.setOrders(orders);
            }
        }
    }

}
@@ -0,0 +1,68 @@
package com.tencent.supersonic.headless.core.chat.query.rule.tag;

import com.tencent.supersonic.common.pojo.DateConf;
import com.tencent.supersonic.common.pojo.enums.QueryType;
import com.tencent.supersonic.common.pojo.enums.TimeMode;
import com.tencent.supersonic.headless.api.pojo.DataSetSchema;
import com.tencent.supersonic.headless.api.pojo.SchemaElementMatch;
import com.tencent.supersonic.headless.api.pojo.TimeDefaultConfig;
import com.tencent.supersonic.headless.core.pojo.ChatContext;
import com.tencent.supersonic.headless.core.pojo.QueryContext;
import com.tencent.supersonic.headless.core.chat.query.rule.RuleSemanticQuery;
import lombok.extern.slf4j.Slf4j;

import java.time.LocalDate;
import java.util.List;
import java.util.Objects;

import static com.tencent.supersonic.headless.api.pojo.SchemaElementType.ENTITY;
import static com.tencent.supersonic.headless.core.chat.query.rule.QueryMatchOption.OptionType.REQUIRED;
import static com.tencent.supersonic.headless.core.chat.query.rule.QueryMatchOption.RequireNumberType.AT_LEAST;

@Slf4j
public abstract class TagSemanticQuery extends RuleSemanticQuery {

    private static final Long TAG_MAX_RESULTS = 500L;

    public TagSemanticQuery() {
        super();
        queryMatcher.addOption(ENTITY, REQUIRED, AT_LEAST, 1);
    }

    @Override
    public List<SchemaElementMatch> match(List<SchemaElementMatch> candidateElementMatches,
                                          QueryContext queryCtx) {
        return super.match(candidateElementMatches, queryCtx);
    }

    @Override
    public void fillParseInfo(QueryContext queryContext, ChatContext chatContext) {
        super.fillParseInfo(queryContext, chatContext);

        parseInfo.setQueryType(QueryType.TAG);
        parseInfo.setLimit(TAG_MAX_RESULTS);
        if (parseInfo.getDateInfo() == null) {
            DataSetSchema dataSetSchema =
                    queryContext.getSemanticSchema().getDataSetSchemaMap().get(parseInfo.getDataSetId());
            TimeDefaultConfig timeDefaultConfig = dataSetSchema.getTagTypeTimeDefaultConfig();
            DateConf dateInfo = new DateConf();
            if (Objects.nonNull(timeDefaultConfig) && Objects.nonNull(timeDefaultConfig.getUnit())) {
                int unit = timeDefaultConfig.getUnit();
                String startDate = LocalDate.now().plusDays(-unit).toString();
                String endDate = startDate;
                if (TimeMode.LAST.equals(timeDefaultConfig.getTimeMode())) {
                    dateInfo.setDateMode(DateConf.DateMode.BETWEEN);
                } else if (TimeMode.RECENT.equals(timeDefaultConfig.getTimeMode())) {
                    dateInfo.setDateMode(DateConf.DateMode.RECENT);
                    endDate = LocalDate.now().plusDays(-1).toString();
                }
                dateInfo.setUnit(unit);
                dateInfo.setPeriod(timeDefaultConfig.getPeriod());
                dateInfo.setStartDate(startDate);
                dateInfo.setEndDate(endDate);
            }
            parseInfo.setDateInfo(dateInfo);
        }
    }

}
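For reference, the two windows this date fallback can produce — a minimal sketch assuming a tag-time default of unit = 7 with period DAY:

    // Sketch only; mirrors the branch logic above with unit = 7.
    LocalDate today = LocalDate.now();
    // TimeMode.RECENT: a rolling window ending yesterday
    String recentStart = today.plusDays(-7).toString();
    String recentEnd = today.plusDays(-1).toString();
    // TimeMode.LAST: the single day exactly `unit` days back
    String lastStart = today.plusDays(-7).toString();
    String lastEnd = lastStart;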
@@ -0,0 +1,13 @@
package com.tencent.supersonic.headless.core.config;

import lombok.Data;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.annotation.Configuration;

@Data
@Configuration
public class AggregatorConfig {

    @Value("${metric.aggregator.ratio.enable:true}")
    private Boolean enableRatio;
}
@@ -0,0 +1,41 @@
package com.tencent.supersonic.headless.core.config;

import com.tencent.supersonic.headless.core.knowledge.helper.HanlpHelper;
import lombok.Data;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.annotation.Configuration;

import java.io.FileNotFoundException;

@Data
@Configuration
@Slf4j
public class ChatLocalFileConfig {

    @Value("${dict.directory.latest:/data/dictionary/custom}")
    private String dictDirectoryLatest;

    @Value("${dict.directory.backup:./dict/backup}")
    private String dictDirectoryBackup;

    public String getDictDirectoryLatest() {
        return getResourceDir() + dictDirectoryLatest;
    }

    public String getDictDirectoryBackup() {
        return dictDirectoryBackup;
    }

    private String getResourceDir() {
        String hanlpPropertiesPath = "";
        try {
            hanlpPropertiesPath = HanlpHelper.getHanlpPropertiesPath();
        } catch (FileNotFoundException e) {
            log.warn("getResourceDir: failed to locate hanlp.properties", e);
        }
        return hanlpPropertiesPath;
    }
}
@@ -0,0 +1,42 @@
package com.tencent.supersonic.headless.core.config;

import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;

@Data
@AllArgsConstructor
@NoArgsConstructor
public class DefaultMetric {

    /**
     * default metric id
     */
    private Long metricId;

    /**
     * default time span unit
     */
    private Integer unit;

    /**
     * default time period: DAY, WEEK, MONTH or YEAR
     */
    private String period;

    private String bizName;

    private String name;

    public DefaultMetric(Long metricId, Integer unit, String period) {
        this.metricId = metricId;
        this.unit = unit;
        this.period = period;
    }

    public DefaultMetric(String bizName, Integer unit, String period) {
        this.bizName = bizName;
        this.unit = unit;
        this.period = period;
    }
}
@@ -0,0 +1,32 @@
package com.tencent.supersonic.headless.core.config;

import com.tencent.supersonic.common.pojo.Constants;
import lombok.Data;
import lombok.ToString;

/**
 * default metric settings for a model
 */
@ToString
@Data
public class DefaultMetricInfo {

    /**
     * default metric id
     */
    private Long metricId;

    /**
     * default time span unit
     */
    private Integer unit = 1;

    /**
     * default time period (defaults to DAY): DAY, WEEK, MONTH or YEAR
     */
    private String period = Constants.DAY;

}
@@ -0,0 +1,44 @@
package com.tencent.supersonic.headless.core.config;

import lombok.Data;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.annotation.Configuration;

@Configuration
@Data
public class DefaultSemanticConfig {

    @Value("${semantic.url.prefix:http://localhost:8081}")
    private String semanticUrl;

    @Value("${searchByStruct.path:/api/semantic/query/struct}")
    private String searchByStructPath;

    @Value("${searchByMultiStruct.path:/api/semantic/query/multiStruct}")
    private String searchByMultiStructPath;

    @Value("${searchBySql.path:/api/semantic/query/sql}")
    private String searchBySqlPath;

    @Value("${queryDimValue.path:/api/semantic/query/queryDimValue}")
    private String queryDimValuePath;

    @Value("${fetchModelSchemaPath.path:/api/semantic/schema}")
    private String fetchModelSchemaPath;

    @Value("${fetchDimensionPage.path:/api/semantic/schema/dimension/page}")
    private String fetchDimensionPagePath;

    @Value("${fetchMetricPage.path:/api/semantic/schema/metric/page}")
    private String fetchMetricPagePath;

    @Value("${fetchDomainList.path:/api/semantic/schema/domain/list}")
    private String fetchDomainListPath;

    @Value("${fetchModelList.path:/api/semantic/schema/model/list}")
    private String fetchModelListPath;

    @Value("${explain.path:/api/semantic/query/explain}")
    private String explainPath;

}
@@ -0,0 +1,22 @@
package com.tencent.supersonic.headless.core.config;

import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.ToString;

import java.util.List;

@Data
@AllArgsConstructor
@ToString
@NoArgsConstructor
public class Dim4Dict {

    private Long dimId;
    private String bizName;
    private List<String> blackList;
    private List<String> whiteList;
    private List<String> ruleList;

}
@@ -0,0 +1,16 @@
package com.tencent.supersonic.headless.core.config;

import lombok.Data;

import java.util.List;

/**
 * when querying an entity, return related dimension/metric info
 */
@Data
public class EntityDetailData {

    private List<Long> dimensionIds;
    private List<Long> metricIds;

}
@@ -0,0 +1,15 @@
package com.tencent.supersonic.headless.core.config;

import com.tencent.supersonic.headless.api.pojo.response.DimSchemaResp;
import com.tencent.supersonic.headless.api.pojo.response.MetricSchemaResp;
import lombok.Data;

import java.util.List;

@Data
public class EntityInternalDetail {

    List<DimSchemaResp> dimensionList;
    List<MetricSchemaResp> metricList;
}
@@ -0,0 +1,30 @@
package com.tencent.supersonic.headless.core.config;

import lombok.Data;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.annotation.Configuration;

@Configuration
@Data
public class LLMParserConfig {

    @Value("${llm.parser.url:}")
    private String url;

    @Value("${query2sql.path:/query2sql}")
    private String queryToSqlPath;

    @Value("${dimension.topn:10}")
    private Integer dimensionTopN;

    @Value("${metric.topn:10}")
    private Integer metricTopN;

    @Value("${tag.topn:20}")
    private Integer tagTopN;

    @Value("${all.model:false}")
    private Boolean allModel;
}
@@ -0,0 +1,175 @@
package com.tencent.supersonic.headless.core.config;

import com.tencent.supersonic.common.service.SysParameterService;
import com.tencent.supersonic.headless.core.chat.query.llm.s2sql.LLMReq.SqlGenerationMode;
import lombok.Data;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.annotation.Configuration;

@Configuration
@Data
@Slf4j
public class OptimizationConfig {

    @Value("${one.detection.size:8}")
    private Integer oneDetectionSize;

    @Value("${one.detection.max.size:20}")
    private Integer oneDetectionMaxSize;

    @Value("${metric.dimension.min.threshold:0.3}")
    private Double metricDimensionMinThresholdConfig;

    @Value("${metric.dimension.threshold:0.3}")
    private Double metricDimensionThresholdConfig;

    @Value("${dimension.value.threshold:0.5}")
    private Double dimensionValueThresholdConfig;

    @Value("${long.text.threshold:0.8}")
    private Double longTextThreshold;

    @Value("${short.text.threshold:0.5}")
    private Double shortTextThreshold;

    @Value("${query.text.length.threshold:10}")
    private Integer queryTextLengthThreshold;

    @Value("${embedding.mapper.word.min:4}")
    private int embeddingMapperWordMin;

    @Value("${embedding.mapper.word.max:5}")
    private int embeddingMapperWordMax;

    @Value("${embedding.mapper.batch:50}")
    private int embeddingMapperBatch;

    @Value("${embedding.mapper.number:5}")
    private int embeddingMapperNumber;

    @Value("${embedding.mapper.round.number:10}")
    private int embeddingMapperRoundNumber;

    @Value("${embedding.mapper.distance.threshold:0.01}")
    private Double embeddingMapperDistanceThreshold;

    @Value("${s2SQL.linking.value.switch:true}")
    private boolean useLinkingValueSwitch;

    @Value("${s2SQL.generation:TWO_PASS_AUTO_COT}")
    private SqlGenerationMode sqlGenerationMode;

    @Value("${s2SQL.use.switch:true}")
    private boolean useS2SqlSwitch;

    @Value("${text2sql.example.num:15}")
    private int text2sqlExampleNum;

    @Value("${text2sql.fewShots.num:10}")
    private int text2sqlFewShotsNum;

    @Value("${text2sql.self.consistency.num:5}")
    private int text2sqlSelfConsistencyNum;

    @Value("${parse.show.count:3}")
    private Integer parseShowCount;

    @Autowired
    private SysParameterService sysParameterService;

    public Integer getOneDetectionSize() {
        return convertValue("one.detection.size", Integer.class, oneDetectionSize);
    }

    public Integer getOneDetectionMaxSize() {
        return convertValue("one.detection.max.size", Integer.class, oneDetectionMaxSize);
    }

    public Double getMetricDimensionMinThresholdConfig() {
        return convertValue("metric.dimension.min.threshold", Double.class, metricDimensionMinThresholdConfig);
    }

    public Double getMetricDimensionThresholdConfig() {
        return convertValue("metric.dimension.threshold", Double.class, metricDimensionThresholdConfig);
    }

    public Double getDimensionValueThresholdConfig() {
        return convertValue("dimension.value.threshold", Double.class, dimensionValueThresholdConfig);
    }

    public Double getLongTextThreshold() {
        return convertValue("long.text.threshold", Double.class, longTextThreshold);
    }

    public Double getShortTextThreshold() {
        return convertValue("short.text.threshold", Double.class, shortTextThreshold);
    }

    public Integer getQueryTextLengthThreshold() {
        return convertValue("query.text.length.threshold", Integer.class, queryTextLengthThreshold);
    }

    public boolean isUseS2SqlSwitch() {
        return convertValue("use.s2SQL.switch", Boolean.class, useS2SqlSwitch);
    }

    public Integer getEmbeddingMapperWordMin() {
        return convertValue("embedding.mapper.word.min", Integer.class, embeddingMapperWordMin);
    }

    public Integer getEmbeddingMapperWordMax() {
        return convertValue("embedding.mapper.word.max", Integer.class, embeddingMapperWordMax);
    }

    public Integer getEmbeddingMapperBatch() {
        return convertValue("embedding.mapper.batch", Integer.class, embeddingMapperBatch);
    }

    public Integer getEmbeddingMapperNumber() {
        return convertValue("embedding.mapper.number", Integer.class, embeddingMapperNumber);
    }

    public Integer getEmbeddingMapperRoundNumber() {
        return convertValue("embedding.mapper.round.number", Integer.class, embeddingMapperRoundNumber);
    }

    public Double getEmbeddingMapperDistanceThreshold() {
        return convertValue("embedding.mapper.distance.threshold", Double.class, embeddingMapperDistanceThreshold);
    }

    public boolean isUseLinkingValueSwitch() {
        return convertValue("s2SQL.linking.value.switch", Boolean.class, useLinkingValueSwitch);
    }

    public SqlGenerationMode getSqlGenerationMode() {
        return convertValue("s2SQL.generation", SqlGenerationMode.class, sqlGenerationMode);
    }

    public Integer getParseShowCount() {
        return convertValue("parse.show.count", Integer.class, parseShowCount);
    }

    public <T> T convertValue(String paramName, Class<T> targetType, T defaultValue) {
        try {
            String value = sysParameterService.getSysParameter().getParameterByName(paramName);
            if (StringUtils.isBlank(value)) {
                return defaultValue;
            }
            if (targetType == Double.class) {
                return targetType.cast(Double.parseDouble(value));
            } else if (targetType == Integer.class) {
                return targetType.cast(Integer.parseInt(value));
            } else if (targetType == Boolean.class) {
                return targetType.cast(Boolean.parseBoolean(value));
            } else if (targetType == SqlGenerationMode.class) {
                return targetType.cast(SqlGenerationMode.valueOf(value));
            }
        } catch (Exception e) {
            log.error("convertValue", e);
        }
        return defaultValue;
    }

}
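As a usage sketch of the override chain above (the value "5" is hypothetical; it assumes an operator has saved a SysParameter named parse.show.count):

    // Sketch: a runtime SysParameter beats the @Value default.
    // If getParameterByName("parse.show.count") returns "5", the getter
    // yields 5; if it returns blank or an unparsable value, the
    // @Value default (3) is used instead.
    OptimizationConfig config = ContextUtils.getBean(OptimizationConfig.class);
    Integer showCount = config.getParseShowCount();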
@@ -0,0 +1,76 @@
package com.tencent.supersonic.headless.core.knowledge;

import com.tencent.supersonic.common.pojo.enums.DictWordType;
import com.tencent.supersonic.headless.api.pojo.response.S2Term;
import com.tencent.supersonic.headless.core.knowledge.helper.HanlpHelper;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;

import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;

@Service
@Slf4j
public class KnowledgeService {

    public void updateSemanticKnowledge(List<DictWord> natures) {

        List<DictWord> prefixes = natures.stream()
                .filter(entry -> !entry.getNatureWithFrequency().contains(DictWordType.SUFFIX.getTypeWithSpilt()))
                .collect(Collectors.toList());

        for (DictWord nature : prefixes) {
            HanlpHelper.addToCustomDictionary(nature);
        }

        List<DictWord> suffixes = natures.stream()
                .filter(entry -> entry.getNatureWithFrequency().contains(DictWordType.SUFFIX.getTypeWithSpilt()))
                .collect(Collectors.toList());

        SearchService.loadSuffix(suffixes);
    }

    public void reloadAllData(List<DictWord> natures) {
        // 1. reload custom knowledge
        try {
            HanlpHelper.reloadCustomDictionary();
        } catch (Exception e) {
            log.error("reloadCustomDictionary error", e);
        }

        // 2. update online knowledge
        updateOnlineKnowledge(natures);
    }

    public void updateOnlineKnowledge(List<DictWord> natures) {
        try {
            updateSemanticKnowledge(natures);
        } catch (Exception e) {
            log.error("updateSemanticKnowledge error", e);
        }
    }

    public List<S2Term> getTerms(String text, Map<Long, List<Long>> modelIdToDataSetIds) {
        return HanlpHelper.getTerms(text, modelIdToDataSetIds);
    }

    public List<HanlpMapResult> prefixSearch(String key, int limit, Map<Long, List<Long>> modelIdToDataSetIds) {
        return prefixSearchByModel(key, limit, modelIdToDataSetIds);
    }

    public List<HanlpMapResult> prefixSearchByModel(String key, int limit,
                                                    Map<Long, List<Long>> modelIdToDataSetIds) {
        return SearchService.prefixSearch(key, limit, modelIdToDataSetIds);
    }

    public List<HanlpMapResult> suffixSearch(String key, int limit, Map<Long, List<Long>> modelIdToDataSetIds) {
        return suffixSearchByModel(key, limit, modelIdToDataSetIds.keySet());
    }

    public List<HanlpMapResult> suffixSearchByModel(String key, int limit, Set<Long> models) {
        return SearchService.suffixSearch(key, limit, models);
    }

}
@@ -0,0 +1,91 @@
package com.tencent.supersonic.headless.core.knowledge;

import com.tencent.supersonic.common.config.EmbeddingConfig;
import com.tencent.supersonic.common.pojo.Constants;
import com.tencent.supersonic.common.util.ComponentFactory;
import com.tencent.supersonic.common.util.embedding.Retrieval;
import com.tencent.supersonic.common.util.embedding.RetrieveQuery;
import com.tencent.supersonic.common.util.embedding.RetrieveQueryResult;
import com.tencent.supersonic.common.util.embedding.S2EmbeddingStore;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections.CollectionUtils;
import org.springframework.beans.BeanUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;

@Service
@Slf4j
public class MetaEmbeddingService {

    private S2EmbeddingStore s2EmbeddingStore = ComponentFactory.getS2EmbeddingStore();

    @Autowired
    private EmbeddingConfig embeddingConfig;

    public List<RetrieveQueryResult> retrieveQuery(RetrieveQuery retrieveQuery, int num,
                                                   Map<Long, List<Long>> modelIdToDataSetIds) {
        // collect the modelIds behind the modelId -> dataSetIds mapping
        Set<Long> allModels = modelIdToDataSetIds.keySet();

        if (CollectionUtils.isNotEmpty(allModels) && allModels.size() == 1) {
            Map<String, String> filterCondition = new HashMap<>();
            filterCondition.put("modelId", allModels.stream().findFirst().get().toString());
            retrieveQuery.setFilterCondition(filterCondition);
        }

        String collectionName = embeddingConfig.getMetaCollectionName();
        List<RetrieveQueryResult> resultList = s2EmbeddingStore.retrieveQuery(collectionName, retrieveQuery, num);
        if (CollectionUtils.isEmpty(resultList)) {
            return new ArrayList<>();
        }
        // no model restriction, return as retrieved
        if (CollectionUtils.isEmpty(allModels)) {
            return resultList;
        }
        return resultList.stream()
                .map(retrieveQueryResult -> {
                    List<Retrieval> retrievals = retrieveQueryResult.getRetrieval();
                    if (CollectionUtils.isEmpty(retrievals)) {
                        return retrieveQueryResult;
                    }
                    // filter by modelId
                    retrievals.removeIf(retrieval -> {
                        Long modelId = Retrieval.getLongId(retrieval.getMetadata().get("modelId"));
                        if (Objects.isNull(modelId)) {
                            return CollectionUtils.isEmpty(allModels);
                        }
                        return !allModels.contains(modelId);
                    });
                    // add dataSetId
                    retrievals = retrievals.stream().flatMap(retrieval -> {
                        Long modelId = Retrieval.getLongId(retrieval.getMetadata().get("modelId"));
                        List<Long> dataSetIdsByModelId = modelIdToDataSetIds.get(modelId);
                        if (!CollectionUtils.isEmpty(dataSetIdsByModelId)) {
                            Set<Retrieval> result = new HashSet<>();
                            for (Long dataSetId : dataSetIdsByModelId) {
                                Retrieval retrievalNew = new Retrieval();
                                BeanUtils.copyProperties(retrieval, retrievalNew);
                                retrievalNew.getMetadata().putIfAbsent("dataSetId", dataSetId + Constants.UNDERLINE);
                                result.add(retrievalNew);
                            }
                            return result.stream();
                        }
                        Set<Retrieval> result = new HashSet<>();
                        result.add(retrieval);
                        return result.stream();
                    }).collect(Collectors.toList());
                    retrieveQueryResult.setRetrieval(retrievals);
                    return retrieveQueryResult;
                })
                .filter(retrieveQueryResult -> CollectionUtils.isNotEmpty(retrieveQueryResult.getRetrieval()))
                .collect(Collectors.toList());
    }
}
@@ -7,9 +7,9 @@ import com.tencent.supersonic.headless.api.pojo.QueryParam;
import com.tencent.supersonic.headless.api.pojo.enums.AggOption;
import com.tencent.supersonic.headless.api.pojo.request.SqlExecuteReq;
import com.tencent.supersonic.headless.core.parser.converter.HeadlessConverter;
import com.tencent.supersonic.headless.core.pojo.DataSetQueryParam;
import com.tencent.supersonic.headless.core.pojo.MetricQueryParam;
import com.tencent.supersonic.headless.core.pojo.QueryStatement;
import com.tencent.supersonic.headless.core.pojo.DataSetQueryParam;
import com.tencent.supersonic.headless.core.utils.ComponentFactory;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
@@ -0,0 +1,14 @@
package com.tencent.supersonic.headless.core.pojo;

import com.tencent.supersonic.headless.api.pojo.SemanticParseInfo;
import lombok.Data;

@Data
public class ChatContext {

    private Integer chatId;
    private String queryText;
    private SemanticParseInfo parseInfo = new SemanticParseInfo();
    private String user;
}
@@ -0,0 +1,60 @@
package com.tencent.supersonic.headless.core.pojo;

import com.fasterxml.jackson.annotation.JsonIgnore;
import com.tencent.supersonic.auth.api.authentication.pojo.User;
import com.tencent.supersonic.common.pojo.enums.QueryType;
import com.tencent.supersonic.common.util.ContextUtils;
import com.tencent.supersonic.headless.api.pojo.DataSetSchema;
import com.tencent.supersonic.headless.api.pojo.SchemaMapInfo;
import com.tencent.supersonic.headless.api.pojo.SemanticSchema;
import com.tencent.supersonic.headless.api.pojo.request.QueryFilters;
import com.tencent.supersonic.headless.core.config.OptimizationConfig;
import com.tencent.supersonic.headless.core.chat.query.SemanticQuery;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;

import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;

@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class QueryContext {

    private String queryText;
    private Integer chatId;
    private Set<Long> dataSetIds;
    private Map<Long, List<Long>> modelIdToDataSetIds;
    private User user;
    private boolean saveAnswer;
    private boolean enableLLM;
    private QueryFilters queryFilters;
    private List<SemanticQuery> candidateQueries = new ArrayList<>();
    private SchemaMapInfo mapInfo = new SchemaMapInfo();
    @JsonIgnore
    private SemanticSchema semanticSchema;

    public List<SemanticQuery> getCandidateQueries() {
        OptimizationConfig optimizationConfig = ContextUtils.getBean(OptimizationConfig.class);
        Integer parseShowCount = optimizationConfig.getParseShowCount();
        candidateQueries = candidateQueries.stream()
                .sorted(Comparator.comparing(semanticQuery -> semanticQuery.getParseInfo().getScore(),
                        Comparator.reverseOrder()))
                .limit(parseShowCount)
                .collect(Collectors.toList());
        return candidateQueries;
    }

    public QueryType getQueryType(Long dataSetId) {
        SemanticSchema semanticSchema = this.semanticSchema;
        DataSetSchema dataSetSchema = semanticSchema.getDataSetSchemaMap().get(dataSetId);
        return dataSetSchema.getQueryType();
    }
}
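A standalone illustration of the sort-and-truncate rule inside getCandidateQueries, using plain scores in place of SemanticQuery objects (assuming parse.show.count resolves to its default of 3):

    List<Double> scores = new ArrayList<>(Arrays.asList(0.9, 0.2, 0.7, 0.5));
    List<Double> shown = scores.stream()
            .sorted(Comparator.reverseOrder())
            .limit(3)  // parse.show.count default
            .collect(Collectors.toList());
    // shown == [0.9, 0.7, 0.5]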
@@ -1,26 +1,35 @@
package com.tencent.supersonic.headless.core.utils;

import com.tencent.supersonic.common.util.ContextUtils;
import com.tencent.supersonic.headless.core.chat.parser.JavaLLMProxy;
import com.tencent.supersonic.headless.core.chat.parser.LLMProxy;
import com.tencent.supersonic.headless.core.chat.parser.llm.DataSetResolver;
import com.tencent.supersonic.headless.core.executor.JdbcExecutor;
import com.tencent.supersonic.headless.core.executor.QueryExecutor;
import com.tencent.supersonic.headless.core.parser.converter.SqlVariableParseConverter;
import com.tencent.supersonic.headless.core.planner.DetailQueryOptimizer;
import com.tencent.supersonic.headless.core.planner.QueryOptimizer;
import com.tencent.supersonic.headless.core.parser.converter.HeadlessConverter;
import com.tencent.supersonic.headless.core.parser.SqlParser;
import com.tencent.supersonic.headless.core.parser.calcite.CalciteSqlParser;
import com.tencent.supersonic.headless.core.parser.converter.CalculateAggConverter;
import com.tencent.supersonic.headless.core.parser.converter.DefaultDimValueConverter;
import com.tencent.supersonic.headless.core.parser.converter.HeadlessConverter;
import com.tencent.supersonic.headless.core.parser.converter.ParserDefaultConverter;
import com.tencent.supersonic.headless.core.parser.converter.SqlVariableParseConverter;
import com.tencent.supersonic.headless.core.planner.DetailQueryOptimizer;
import com.tencent.supersonic.headless.core.planner.QueryOptimizer;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.springframework.core.io.support.SpringFactoriesLoader;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.stream.Collectors;

/**
 * object factory for HeadlessConverter, QueryOptimizer and QueryExecutor
 */
@Slf4j
public class ComponentFactory {

    private static List<HeadlessConverter> headlessConverters = new ArrayList<>();
@@ -28,6 +37,9 @@ public class ComponentFactory {
    private static List<QueryExecutor> queryExecutors = new ArrayList<>();
    private static SqlParser sqlParser;

    private static LLMProxy llmProxy;
    private static DataSetResolver modelResolver;

    static {
        initSemanticConverter();
        initQueryOptimizer();
@@ -89,4 +101,41 @@ public class ComponentFactory {
        headlessConverters.add(getBean("ParserDefaultConverter", ParserDefaultConverter.class));
    }

    public static LLMProxy getLLMProxy() {
        // 1. preferentially resolve from the "llmProxy" environment variable
        String llmProxyEnv = System.getenv("llmProxy");
        if (StringUtils.isNotBlank(llmProxyEnv)) {
            Map<String, LLMProxy> implementations = ContextUtils.getBeansOfType(LLMProxy.class);
            llmProxy = implementations.entrySet().stream()
                    .filter(entry -> entry.getKey().equalsIgnoreCase(llmProxyEnv))
                    .map(Map.Entry::getValue)
                    .findFirst()
                    .orElse(null);
        }
        // 2. fall back to the default JavaLLMProxy
        if (Objects.isNull(llmProxy)) {
            llmProxy = ContextUtils.getBean(JavaLLMProxy.class);
        }
        log.info("llmProxy:{}", llmProxy);
        return llmProxy;
    }

    public static DataSetResolver getModelResolver() {
        if (Objects.isNull(modelResolver)) {
            modelResolver = init(DataSetResolver.class);
        }
        return modelResolver;
    }

    private static <T> List<T> init(Class<T> factoryType, List list) {
        list.addAll(SpringFactoriesLoader.loadFactories(factoryType,
                Thread.currentThread().getContextClassLoader()));
        return list;
    }

    private static <T> T init(Class<T> factoryType) {
        return SpringFactoriesLoader.loadFactories(factoryType,
                Thread.currentThread().getContextClassLoader()).get(0);
    }

}
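Since getModelResolver() loads its implementation through SpringFactoriesLoader, a module can contribute one by listing it in its own META-INF/spring.factories; an illustrative entry (the implementation class name here is hypothetical):

    # META-INF/spring.factories
    com.tencent.supersonic.headless.core.chat.parser.llm.DataSetResolver=\
    com.example.CustomDataSetResolver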
@@ -0,0 +1,254 @@
package com.tencent.supersonic.headless.core.utils;

import com.google.common.collect.Lists;
import com.tencent.supersonic.common.pojo.Aggregator;
import com.tencent.supersonic.common.pojo.Constants;
import com.tencent.supersonic.common.pojo.DateConf;
import com.tencent.supersonic.common.pojo.Filter;
import com.tencent.supersonic.common.pojo.Order;
import com.tencent.supersonic.common.pojo.enums.AggOperatorEnum;
import com.tencent.supersonic.common.pojo.enums.AggregateTypeEnum;
import com.tencent.supersonic.common.pojo.enums.QueryType;
import com.tencent.supersonic.common.pojo.enums.TimeDimensionEnum;
import com.tencent.supersonic.headless.api.pojo.SchemaElement;
import com.tencent.supersonic.headless.api.pojo.SemanticParseInfo;
import com.tencent.supersonic.headless.api.pojo.request.QueryFilter;
import com.tencent.supersonic.headless.api.pojo.request.QueryMultiStructReq;
import com.tencent.supersonic.headless.api.pojo.request.QuerySqlReq;
import com.tencent.supersonic.headless.api.pojo.request.QueryStructReq;
import com.tencent.supersonic.headless.core.chat.query.QueryManager;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.util.Strings;
import org.springframework.beans.BeanUtils;
import org.springframework.util.CollectionUtils;

import java.time.LocalDate;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;

@Slf4j
public class QueryReqBuilder {

    public static QueryStructReq buildStructReq(SemanticParseInfo parseInfo) {
        QueryStructReq queryStructReq = new QueryStructReq();
        queryStructReq.setDataSetId(parseInfo.getDataSetId());
        queryStructReq.setDataSetName(parseInfo.getDataSet().getName());
        queryStructReq.setQueryType(parseInfo.getQueryType());
        queryStructReq.setDateInfo(rewrite2Between(parseInfo.getDateInfo()));

        List<Filter> dimensionFilters = getFilters(parseInfo.getDimensionFilters());
        queryStructReq.setDimensionFilters(dimensionFilters);

        List<Filter> metricFilters = parseInfo.getMetricFilters().stream()
                .map(chatFilter -> new Filter(chatFilter.getBizName(), chatFilter.getOperator(), chatFilter.getValue()))
                .collect(Collectors.toList());
        queryStructReq.setMetricFilters(metricFilters);

        addDateDimension(parseInfo);
        List<String> dimensions = parseInfo.getDimensions().stream().map(SchemaElement::getBizName)
                .collect(Collectors.toList());
        queryStructReq.setGroups(dimensions);
        queryStructReq.setLimit(parseInfo.getLimit());
        // only one metric is queried at once
        Set<SchemaElement> metrics = parseInfo.getMetrics();
        if (!CollectionUtils.isEmpty(metrics)) {
            SchemaElement metricElement = parseInfo.getMetrics().iterator().next();
            Set<Order> order = getOrder(parseInfo.getOrders(), parseInfo.getAggType(), metricElement);
            queryStructReq.setAggregators(getAggregatorByMetric(parseInfo.getAggType(), metricElement));
            queryStructReq.setOrders(new ArrayList<>(order));
        }

        deletionDuplicated(queryStructReq);

        return queryStructReq;
    }

    private static List<Filter> getFilters(Set<QueryFilter> queryFilters) {
        return queryFilters.stream()
                .filter(chatFilter -> Strings.isNotEmpty(chatFilter.getBizName()))
                .map(chatFilter -> new Filter(chatFilter.getBizName(), chatFilter.getOperator(), chatFilter.getValue()))
                .collect(Collectors.toList());
    }

    private static void deletionDuplicated(QueryStructReq queryStructReq) {
        if (!CollectionUtils.isEmpty(queryStructReq.getGroups()) && queryStructReq.getGroups().size() > 1) {
            Set<String> groups = new HashSet<>(queryStructReq.getGroups());
            queryStructReq.getGroups().clear();
            queryStructReq.getGroups().addAll(groups);
        }
    }

    private static DateConf rewrite2Between(DateConf dateInfo) {
        // guard first: copyProperties would throw on a null source
        if (Objects.isNull(dateInfo)) {
            return null;
        }
        DateConf dateInfoNew = new DateConf();
        BeanUtils.copyProperties(dateInfo, dateInfoNew);
        if (DateConf.DateMode.RECENT.equals(dateInfo.getDateMode())) {
            int unit = dateInfo.getUnit();
            int days = 1;
            switch (dateInfo.getPeriod()) {
                case Constants.DAY:
                    days = 1;
                    break;
                case Constants.WEEK:
                    days = 7;
                    break;
                case Constants.MONTH:
                    days = 30;
                    break;
                case Constants.YEAR:
                    days = 365;
                    break;
                default:
                    break;
            }
            String startDate = LocalDate.now().plusDays(-(unit * days)).toString();
            String endDate = LocalDate.now().plusDays(-1).toString();
            dateInfoNew.setDateMode(DateConf.DateMode.BETWEEN);
            dateInfoNew.setStartDate(startDate);
            dateInfoNew.setEndDate(endDate);
        }
        return dateInfoNew;
    }

    public static QueryMultiStructReq buildMultiStructReq(SemanticParseInfo parseInfo) {
        QueryStructReq queryStructReq = buildStructReq(parseInfo);
        QueryMultiStructReq queryMultiStructReq = new QueryMultiStructReq();
        List<QueryStructReq> queryStructReqs = Lists.newArrayList();
        for (Filter dimensionFilter : queryStructReq.getDimensionFilters()) {
            QueryStructReq req = new QueryStructReq();
            BeanUtils.copyProperties(queryStructReq, req);
            req.setDataSetId(parseInfo.getDataSetId());
            req.setDimensionFilters(Lists.newArrayList(dimensionFilter));
            queryStructReqs.add(req);
        }
        queryMultiStructReq.setQueryStructReqs(queryStructReqs);
        return queryMultiStructReq;
    }

    /**
     * convert to QuerySqlReq
     *
     * @param querySql the S2SQL text to execute
     * @param dataSetId id of the data set the SQL runs against
     * @return the assembled QuerySqlReq
     */
    public static QuerySqlReq buildS2SQLReq(String querySql, Long dataSetId) {
        QuerySqlReq querySQLReq = new QuerySqlReq();
        if (Objects.nonNull(querySql)) {
            querySQLReq.setSql(querySql);
        }
        querySQLReq.setDataSetId(dataSetId);
        return querySQLReq;
    }

    private static List<Aggregator> getAggregatorByMetric(AggregateTypeEnum aggregateType, SchemaElement metric) {
        List<Aggregator> aggregators = new ArrayList<>();
        if (metric != null) {
            String agg = "";
            if (Objects.isNull(aggregateType) || aggregateType.equals(AggregateTypeEnum.NONE)
                    || AggOperatorEnum.COUNT_DISTINCT.name().equalsIgnoreCase(metric.getDefaultAgg())) {
                if (StringUtils.isNotBlank(metric.getDefaultAgg())) {
                    agg = metric.getDefaultAgg();
                }
            } else {
                agg = aggregateType.name();
            }
            aggregators.add(new Aggregator(metric.getBizName(), AggOperatorEnum.of(agg)));
        }
        return aggregators;
    }

    private static void addDateDimension(SemanticParseInfo parseInfo) {
        if (parseInfo != null) {
            String queryMode = parseInfo.getQueryMode();
            if (parseInfo.getDateInfo() == null) {
                return;
            }
            if (parseInfo.getAggType() != null && (parseInfo.getAggType().equals(AggregateTypeEnum.MAX)
                    || parseInfo.getAggType().equals(AggregateTypeEnum.MIN)) && !CollectionUtils.isEmpty(
                    parseInfo.getDimensions())) {
                return;
            }
            DateConf dateInfo = parseInfo.getDateInfo();
            String dateField = getDateField(dateInfo);

            for (SchemaElement dimension : parseInfo.getDimensions()) {
                if (dimension.getBizName().equalsIgnoreCase(dateField)) {
                    return;
                }
            }

            if (Objects.nonNull(parseInfo.getAggType()) && !parseInfo.getAggType().equals(AggregateTypeEnum.NONE)) {
                return;
            }

            SchemaElement dimension = new SchemaElement();
            dimension.setBizName(dateField);

            if (QueryManager.isMetricQuery(queryMode)) {
                List<String> timeDimensions = Arrays.asList(TimeDimensionEnum.DAY.getName(),
                        TimeDimensionEnum.WEEK.getName(), TimeDimensionEnum.MONTH.getName());
                Set<SchemaElement> dimensions = parseInfo.getDimensions().stream()
                        .filter(d -> !timeDimensions.contains(d.getBizName().toLowerCase()))
                        .collect(Collectors.toSet());
                dimensions.add(dimension);
                parseInfo.setDimensions(dimensions);
            }
        }
    }

    public static Set<Order> getOrder(Set<Order> parseOrder, AggregateTypeEnum aggregator, SchemaElement metric) {
        if (!CollectionUtils.isEmpty(parseOrder)) {
            return parseOrder;
        }
        Set<Order> orders = new LinkedHashSet<>();
        if (metric == null) {
            return orders;
        }

        if (AggregateTypeEnum.TOPN.equals(aggregator) || AggregateTypeEnum.MAX.equals(aggregator)
                || AggregateTypeEnum.MIN.equals(aggregator)) {
            Order order = new Order();
            order.setColumn(metric.getBizName());
            order.setDirection("desc");
            orders.add(order);
        }
        return orders;
    }

    public static String getDateField(DateConf dateConf) {
        if (Objects.isNull(dateConf)) {
            return "";
        }
        String dateField = TimeDimensionEnum.DAY.getName();
        if (Constants.MONTH.equals(dateConf.getPeriod())) {
            dateField = TimeDimensionEnum.MONTH.getName();
        }
        if (Constants.WEEK.equals(dateConf.getPeriod())) {
            dateField = TimeDimensionEnum.WEEK.getName();
        }
        return dateField;
    }

    public static QueryStructReq buildStructRatioReq(SemanticParseInfo parseInfo, SchemaElement metric,
                                                     AggOperatorEnum aggOperatorEnum) {
        QueryStructReq queryStructReq = buildStructReq(parseInfo);
        queryStructReq.setQueryType(QueryType.METRIC);
        queryStructReq.setOrders(new ArrayList<>());
        List<Aggregator> aggregators = new ArrayList<>();
        Aggregator ratioRoll = new Aggregator(metric.getBizName(), aggOperatorEnum);
        aggregators.add(ratioRoll);
        queryStructReq.setAggregators(aggregators);
        return queryStructReq;
    }

}
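To make rewrite2Between concrete, a worked sketch (the dates assume the method runs on 2024-03-01):

    // Sketch: "recent 2 weeks" becomes an explicit BETWEEN window.
    DateConf recent = new DateConf();
    recent.setDateMode(DateConf.DateMode.RECENT);
    recent.setUnit(2);
    recent.setPeriod(Constants.WEEK);
    // rewrite2Between(recent) on 2024-03-01 yields:
    //   dateMode  = BETWEEN
    //   startDate = "2024-02-16"  (today - 2 * 7 days)
    //   endDate   = "2024-02-29"  (yesterday)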
@@ -0,0 +1,69 @@
package com.tencent.supersonic.headless.core.utils;

import com.tencent.supersonic.common.pojo.enums.QueryType;
import com.tencent.supersonic.common.pojo.enums.TimeMode;
import com.tencent.supersonic.common.util.DatePeriodEnum;
import com.tencent.supersonic.common.util.DateUtils;
import com.tencent.supersonic.headless.api.pojo.DataSetSchema;
import com.tencent.supersonic.headless.api.pojo.TimeDefaultConfig;
import com.tencent.supersonic.headless.core.pojo.QueryContext;
import org.apache.commons.lang3.tuple.Pair;

import java.util.Objects;

public class S2SqlDateHelper {

    public static String getReferenceDate(QueryContext queryContext, Long dataSetId) {
        String defaultDate = DateUtils.getBeforeDate(0);
        if (Objects.isNull(dataSetId)) {
            return defaultDate;
        }
        DataSetSchema dataSetSchema = queryContext.getSemanticSchema().getDataSetSchemaMap().get(dataSetId);
        if (dataSetSchema == null || dataSetSchema.getTagTypeTimeDefaultConfig() == null) {
            return defaultDate;
        }
        TimeDefaultConfig tagTypeTimeDefaultConfig = dataSetSchema.getTagTypeTimeDefaultConfig();
        return getDefaultDate(defaultDate, tagTypeTimeDefaultConfig).getLeft();
    }

    public static Pair<String, String> getStartEndDate(QueryContext queryContext, Long dataSetId,
                                                       QueryType queryType) {
        String defaultDate = DateUtils.getBeforeDate(0);
        if (Objects.isNull(dataSetId)) {
            return Pair.of(defaultDate, defaultDate);
        }
        DataSetSchema dataSetSchema = queryContext.getSemanticSchema().getDataSetSchemaMap().get(dataSetId);
        if (dataSetSchema == null) {
            return Pair.of(defaultDate, defaultDate);
        }
        TimeDefaultConfig defaultConfig = dataSetSchema.getMetricTypeTimeDefaultConfig();
        if (QueryType.TAG.equals(queryType)) {
            defaultConfig = dataSetSchema.getTagTypeTimeDefaultConfig();
        }
        return getDefaultDate(defaultDate, defaultConfig);
    }

    private static Pair<String, String> getDefaultDate(String defaultDate, TimeDefaultConfig defaultConfig) {
        if (Objects.isNull(defaultConfig)) {
            return Pair.of(null, null);
        }
        Integer unit = defaultConfig.getUnit();
        String period = defaultConfig.getPeriod();
        TimeMode timeMode = defaultConfig.getTimeMode();
        if (Objects.nonNull(unit)) {
            // if the unit is set to less than 0, do not add a relative date
            if (unit < 0) {
                return Pair.of(null, null);
            }
            DatePeriodEnum datePeriodEnum = DatePeriodEnum.get(period);
            String startDate = DateUtils.getBeforeDate(unit, datePeriodEnum);
            String endDate = DateUtils.getBeforeDate(1, datePeriodEnum);
            if (TimeMode.LAST.equals(timeMode)) {
                endDate = startDate;
            }
            return Pair.of(startDate, endDate);
        }
        return Pair.of(defaultDate, defaultDate);
    }

}
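A worked sketch of getDefaultDate for unit = 7 and period DAY (assuming DateUtils.getBeforeDate(n, period) returns the date n periods before today, which is how the calls above read):

    String start = DateUtils.getBeforeDate(7, DatePeriodEnum.get(Constants.DAY));  // today - 7 days
    String end = DateUtils.getBeforeDate(1, DatePeriodEnum.get(Constants.DAY));    // yesterday
    // TimeMode.RECENT keeps (start, end); TimeMode.LAST collapses to (start, start).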
@@ -0,0 +1,45 @@
package com.tencent.supersonic.headless.core.utils;

import com.tencent.supersonic.headless.api.pojo.SchemaElementMatch;
import com.tencent.supersonic.headless.api.pojo.SchemaElementType;
import lombok.extern.slf4j.Slf4j;
import org.springframework.util.CollectionUtils;

import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

/**
 * helper that prunes schema element matches
 */
@Slf4j
public class SchemaMatchHelper {

    public static void removeSamePrefixDetectWord(List<SchemaElementMatch> matches) {
        if (CollectionUtils.isEmpty(matches)) {
            return;
        }

        Set<String> metricDimensionDetectWordSet = matches.stream()
                .filter(SchemaMatchHelper::isMetricOrDimension)
                .map(SchemaElementMatch::getDetectWord).collect(Collectors.toSet());

        matches.removeIf(elementMatch -> {
            if (!isMetricOrDimension(elementMatch)) {
                return false;
            }
            for (String detectWord : metricDimensionDetectWordSet) {
                if (detectWord.startsWith(elementMatch.getDetectWord())
                        && detectWord.length() > elementMatch.getDetectWord().length()) {
                    return true;
                }
            }
            return false;
        });
    }

    private static boolean isMetricOrDimension(SchemaElementMatch elementMatch) {
        return SchemaElementType.METRIC.equals(elementMatch.getElement().getType())
                || SchemaElementType.DIMENSION.equals(elementMatch.getElement().getType());
    }
}
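The pruning rule keeps only the longest of prefix-related detect words; a standalone illustration on plain strings rather than the real SchemaElementMatch type:

    // If both "播放" and "播放量" were detected for metrics/dimensions,
    // the shorter prefix "播放" is dropped in favor of "播放量".
    List<String> detectWords = new ArrayList<>(Arrays.asList("播放", "播放量"));
    Set<String> all = new HashSet<>(detectWords);
    detectWords.removeIf(word -> all.stream()
            .anyMatch(other -> other.startsWith(word) && other.length() > word.length()));
    // detectWords == ["播放量"]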
@@ -0,0 +1,145 @@
package com.tencent.supersonic.chat.core.corrector;

import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.tencent.supersonic.common.pojo.Constants;
import com.tencent.supersonic.headless.api.pojo.DataSetSchema;
import com.tencent.supersonic.headless.api.pojo.QueryConfig;
import com.tencent.supersonic.headless.api.pojo.SchemaElement;
import com.tencent.supersonic.headless.api.pojo.SemanticParseInfo;
import com.tencent.supersonic.headless.api.pojo.SemanticSchema;
import com.tencent.supersonic.headless.api.pojo.SqlInfo;
import com.tencent.supersonic.headless.core.chat.corrector.SchemaCorrector;
import com.tencent.supersonic.headless.core.chat.parser.llm.ParseResult;
import com.tencent.supersonic.headless.core.pojo.QueryContext;
import com.tencent.supersonic.headless.core.chat.query.llm.s2sql.LLMReq;
import org.junit.jupiter.api.Test;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import static org.testng.Assert.assertEquals;

class SchemaCorrectorTest {

    private String json = "{\n"
            + " \"dataSetId\": 1,\n"
            + " \"llmReq\": {\n"
            + " \"queryText\": \"xxx2024年播放量最高的十首歌\",\n"
            + " \"filterCondition\": {\n"
            + " \"tableName\": null\n"
            + " },\n"
            + " \"schema\": {\n"
            + " \"domainName\": \"歌曲\",\n"
            + " \"dataSetName\": \"歌曲\",\n"
            + " \"fieldNameList\": [\n"
            + " \"商务组\",\n"
            + " \"歌曲名\",\n"
            + " \"播放量\",\n"
            + " \"播放份额\",\n"
            + " \"数据日期\"\n"
            + " ]\n"
            + " },\n"
            + " \"linking\": [\n"
            + "\n"
            + " ],\n"
            + " \"currentDate\": \"2024-02-24\",\n"
            + " \"priorExts\": \"播放份额是小数; \",\n"
            + " \"sqlGenerationMode\": \"2_pass_auto_cot\"\n"
            + " },\n"
            + " \"request\": null,\n"
            + " \"commonAgentTool\": {\n"
            + " \"id\": \"y3LqVSRL\",\n"
            + " \"name\": \"大模型语义解析\",\n"
            + " \"type\": \"NL2SQL_LLM\",\n"
            + " \"dataSetIds\": [\n"
            + " 1\n"
            + " ]\n"
            + " },\n"
            + " \"linkingValues\": [\n"
            + "\n"
            + " ]\n"
            + "}";

    @Test
    void doCorrect() throws JsonProcessingException {
        Long dataSetId = 1L;
        QueryContext queryContext = buildQueryContext(dataSetId);
        ObjectMapper objectMapper = new ObjectMapper();
        ParseResult parseResult = objectMapper.readValue(json, ParseResult.class);

        String sql = "select 歌曲名 from 歌曲 where 发行日期 >= '2024-01-01' "
                + "and 商务组 = 'xxx' order by 播放量 desc limit 10";
        SemanticParseInfo semanticParseInfo = new SemanticParseInfo();
        SqlInfo sqlInfo = new SqlInfo();
        sqlInfo.setS2SQL(sql);
        sqlInfo.setCorrectS2SQL(sql);
        semanticParseInfo.setSqlInfo(sqlInfo);

        SchemaElement schemaElement = new SchemaElement();
        schemaElement.setDataSet(dataSetId);
        semanticParseInfo.setDataSet(schemaElement);

        semanticParseInfo.getProperties().put(Constants.CONTEXT, parseResult);

        SchemaCorrector schemaCorrector = new SchemaCorrector();
        schemaCorrector.removeFilterIfNotInLinkingValue(queryContext, semanticParseInfo);

        assertEquals("SELECT 歌曲名 FROM 歌曲 WHERE 发行日期 >= '2024-01-01' "
                + "ORDER BY 播放量 DESC LIMIT 10", semanticParseInfo.getSqlInfo().getCorrectS2SQL());

        parseResult = objectMapper.readValue(json, ParseResult.class);

        List<LLMReq.ElementValue> linkingValues = new ArrayList<>();
        LLMReq.ElementValue elementValue = new LLMReq.ElementValue();
        elementValue.setFieldName("商务组");
        elementValue.setFieldValue("xxx");
        linkingValues.add(elementValue);
        parseResult.setLinkingValues(linkingValues);
        semanticParseInfo.getProperties().put(Constants.CONTEXT, parseResult);

        semanticParseInfo.getSqlInfo().setCorrectS2SQL(sql);
        semanticParseInfo.getSqlInfo().setS2SQL(sql);
        schemaCorrector.removeFilterIfNotInLinkingValue(queryContext, semanticParseInfo);
        assertEquals("SELECT 歌曲名 FROM 歌曲 WHERE 发行日期 >= '2024-01-01' "
                + "AND 商务组 = 'xxx' ORDER BY 播放量 DESC LIMIT 10", semanticParseInfo.getSqlInfo().getCorrectS2SQL());

    }

    private QueryContext buildQueryContext(Long dataSetId) {
        QueryContext queryContext = new QueryContext();
        List<DataSetSchema> dataSetSchemaList = new ArrayList<>();
        DataSetSchema dataSetSchema = new DataSetSchema();
        QueryConfig queryConfig = new QueryConfig();
        dataSetSchema.setQueryConfig(queryConfig);
        SchemaElement schemaElement = new SchemaElement();
        schemaElement.setDataSet(dataSetId);
        dataSetSchema.setDataSet(schemaElement);
        Set<SchemaElement> dimensions = new HashSet<>();
        SchemaElement element1 = new SchemaElement();
        element1.setDataSet(1L);
        element1.setName("歌曲名");
        dimensions.add(element1);

        SchemaElement element2 = new SchemaElement();
        element2.setDataSet(1L);
        element2.setName("商务组");
        dimensions.add(element2);

        SchemaElement element3 = new SchemaElement();
        element3.setDataSet(1L);
        element3.setName("发行日期");
        dimensions.add(element3);

        dataSetSchema.setDimensions(dimensions);
        dataSetSchemaList.add(dataSetSchema);

        SemanticSchema semanticSchema = new SemanticSchema(dataSetSchemaList);
        queryContext.setSemanticSchema(semanticSchema);
        return queryContext;
    }
}
@@ -0,0 +1,101 @@
package com.tencent.supersonic.chat.core.corrector;

import com.tencent.supersonic.headless.api.pojo.SemanticParseInfo;
import com.tencent.supersonic.headless.api.pojo.SqlInfo;
import com.tencent.supersonic.headless.core.chat.corrector.TimeCorrector;
import com.tencent.supersonic.headless.core.pojo.QueryContext;
import org.junit.jupiter.api.Test;
import org.testng.Assert;

class TimeCorrectorTest {

    @Test
    void testDoCorrect() {
        TimeCorrector corrector = new TimeCorrector();
        QueryContext queryContext = new QueryContext();
        SemanticParseInfo semanticParseInfo = new SemanticParseInfo();
        SqlInfo sqlInfo = new SqlInfo();
        // 1. 数据日期 (data date) <=
        String sql = "SELECT 维度1, SUM(播放量) FROM 数据库 "
                + "WHERE (歌手名 = '张三') AND 数据日期 <= '2023-11-17' GROUP BY 维度1";
        sqlInfo.setCorrectS2SQL(sql);
        semanticParseInfo.setSqlInfo(sqlInfo);
        corrector.doCorrect(queryContext, semanticParseInfo);

        Assert.assertEquals(
                "SELECT 维度1, SUM(播放量) FROM 数据库 WHERE ((歌手名 = '张三') AND 数据日期 <= '2023-11-17') "
                        + "AND 数据日期 >= '2023-11-17' GROUP BY 维度1",
                sqlInfo.getCorrectS2SQL());

        // 2. 数据日期 <
        sql = "SELECT 维度1, SUM(播放量) FROM 数据库 "
                + "WHERE (歌手名 = '张三') AND 数据日期 < '2023-11-17' GROUP BY 维度1";
        sqlInfo.setCorrectS2SQL(sql);
        corrector.doCorrect(queryContext, semanticParseInfo);

        Assert.assertEquals(
                "SELECT 维度1, SUM(播放量) FROM 数据库 WHERE ((歌手名 = '张三') AND 数据日期 < '2023-11-17') "
                        + "AND 数据日期 >= '2023-11-17' GROUP BY 维度1",
                sqlInfo.getCorrectS2SQL());

        // 3. 数据日期 >=
        sql = "SELECT 维度1, SUM(播放量) FROM 数据库 "
                + "WHERE (歌手名 = '张三') AND 数据日期 >= '2023-11-17' GROUP BY 维度1";
        sqlInfo.setCorrectS2SQL(sql);
        corrector.doCorrect(queryContext, semanticParseInfo);

        Assert.assertEquals(
                "SELECT 维度1, SUM(播放量) FROM 数据库 "
                        + "WHERE (歌手名 = '张三') AND 数据日期 >= '2023-11-17' GROUP BY 维度1",
                sqlInfo.getCorrectS2SQL());

        // 4. 数据日期 >
        sql = "SELECT 维度1, SUM(播放量) FROM 数据库 "
                + "WHERE (歌手名 = '张三') AND 数据日期 > '2023-11-17' GROUP BY 维度1";
        sqlInfo.setCorrectS2SQL(sql);
        corrector.doCorrect(queryContext, semanticParseInfo);

        Assert.assertEquals(
                "SELECT 维度1, SUM(播放量) FROM 数据库 "
                        + "WHERE (歌手名 = '张三') AND 数据日期 > '2023-11-17' GROUP BY 维度1",
                sqlInfo.getCorrectS2SQL());

        // 5. no 数据日期 in the WHERE clause
        sql = "SELECT 维度1, SUM(播放量) FROM 数据库 "
                + "WHERE 歌手名 = '张三' GROUP BY 维度1";
        sqlInfo.setCorrectS2SQL(sql);
        corrector.doCorrect(queryContext, semanticParseInfo);

        Assert.assertEquals(
                "SELECT 维度1, SUM(播放量) FROM 数据库 WHERE 歌手名 = '张三' GROUP BY 维度1",
                sqlInfo.getCorrectS2SQL());

        // 6. monthly granularity (数据日期_月) <=
        sql = "SELECT 维度1, SUM(播放量) FROM 数据库 "
                + "WHERE 歌手名 = '张三' AND 数据日期_月 <= '2024-01' GROUP BY 维度1";
        sqlInfo.setCorrectS2SQL(sql);
        corrector.doCorrect(queryContext, semanticParseInfo);

        Assert.assertEquals(
                "SELECT 维度1, SUM(播放量) FROM 数据库 WHERE (歌手名 = '张三' AND 数据日期_月 <= '2024-01') "
                        + "AND 数据日期_月 >= '2024-01' GROUP BY 维度1",
                sqlInfo.getCorrectS2SQL());

        // 7. monthly granularity (数据日期_月) >
        sql = "SELECT 维度1, SUM(播放量) FROM 数据库 "
                + "WHERE 歌手名 = '张三' AND 数据日期_月 > '2024-01' GROUP BY 维度1";
        sqlInfo.setCorrectS2SQL(sql);
        corrector.doCorrect(queryContext, semanticParseInfo);

        Assert.assertEquals(
                "SELECT 维度1, SUM(播放量) FROM 数据库 "
                        + "WHERE 歌手名 = '张三' AND 数据日期_月 > '2024-01' GROUP BY 维度1",
                sqlInfo.getCorrectS2SQL());

        // 8. no WHERE clause at all
        sql = "SELECT COUNT(1) FROM 数据库";
        sqlInfo.setCorrectS2SQL(sql);
        corrector.doCorrect(queryContext, semanticParseInfo);
        Assert.assertEquals("SELECT COUNT(1) FROM 数据库", sqlInfo.getCorrectS2SQL());
    }
}
@@ -0,0 +1,15 @@
package com.tencent.supersonic.chat.core.mapper;

import com.hankcs.hanlp.algorithm.EditDistance;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;

class LoadRemoveServiceTest {

    @Test
    void edit() {
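        // HanLP's EditDistance.compute returns the character-level edit distance:
        // turning "在" into "在你的身边" takes four insertions.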
        int compute = EditDistance.compute("在", "在你的身边");
        // JUnit's assertEquals takes (expected, actual)
        Assertions.assertEquals(4, compute);
    }
}
@@ -0,0 +1,37 @@
package com.tencent.supersonic.chat.core.parser.aggregate;

import com.tencent.supersonic.common.pojo.enums.AggregateTypeEnum;
import com.tencent.supersonic.headless.core.chat.parser.rule.AggregateTypeParser;
import org.junit.jupiter.api.Test;
import org.testng.Assert;

class AggregateTypeParserTest {

    @Test
    void getAggregateParser() {
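        // Per the assertions below, resolveAggregateType maps aggregation keywords
        // in the query text to an AggregateTypeEnum: 最大值 -> MAX, pv -> COUNT,
        // uv -> DISTINCT, 最小值 -> MIN, 平均值 -> AVG, topN -> TOPN, 汇总 -> SUM.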
        AggregateTypeParser aggregateParser = new AggregateTypeParser();
        AggregateTypeEnum aggregateType = aggregateParser.resolveAggregateType("supsersonic产品访问次数最大值");
        Assert.assertEquals(aggregateType, AggregateTypeEnum.MAX);

        aggregateType = aggregateParser.resolveAggregateType("supsersonic产品pv");
        Assert.assertEquals(aggregateType, AggregateTypeEnum.COUNT);

        aggregateType = aggregateParser.resolveAggregateType("supsersonic产品uv");
        Assert.assertEquals(aggregateType, AggregateTypeEnum.DISTINCT);

        aggregateType = aggregateParser.resolveAggregateType("supsersonic产品访问次数最大值");
        Assert.assertEquals(aggregateType, AggregateTypeEnum.MAX);

        aggregateType = aggregateParser.resolveAggregateType("supsersonic产品访问次数最小值");
        Assert.assertEquals(aggregateType, AggregateTypeEnum.MIN);

        aggregateType = aggregateParser.resolveAggregateType("supsersonic产品访问次数平均值");
        Assert.assertEquals(aggregateType, AggregateTypeEnum.AVG);

        aggregateType = aggregateParser.resolveAggregateType("supsersonic产品访问次数topN");
        Assert.assertEquals(aggregateType, AggregateTypeEnum.TOPN);

        aggregateType = aggregateParser.resolveAggregateType("supsersonic产品访问次数汇总");
        Assert.assertEquals(aggregateType, AggregateTypeEnum.SUM);
    }
}
@@ -0,0 +1,56 @@
package com.tencent.supersonic.chat.core.s2sql;

import com.tencent.supersonic.headless.core.chat.parser.llm.LLMResponseService;
import com.tencent.supersonic.headless.core.chat.query.llm.s2sql.LLMResp;
import com.tencent.supersonic.headless.core.chat.query.llm.s2sql.LLMSqlResp;
import org.junit.jupiter.api.Test;
import org.testng.Assert;

import java.util.HashMap;
import java.util.Map;

class LLMResponseServiceTest {

    @Test
    void deduplicationSqlWeight() {
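        // Per the assertions below, getDeduplicationSqlResp treats candidate SQLs
        // as equivalent when they differ only in select-column order and
        // WHERE-condition order; SQLs selecting a different column set stay separate.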
        String sql1 = "SELECT a,b,c,d FROM table1 WHERE column1 = 1 AND column2 = 2 order by a";
        String sql2 = "SELECT d,c,b,a FROM table1 WHERE column2 = 2 AND column1 = 1 order by a";

        LLMResp llmResp = new LLMResp();
        Map<String, LLMSqlResp> sqlWeight = new HashMap<>();
        sqlWeight.put(sql1, LLMSqlResp.builder().sqlWeight(0.20).build());
        sqlWeight.put(sql2, LLMSqlResp.builder().sqlWeight(0.80).build());

        llmResp.setSqlRespMap(sqlWeight);
        LLMResponseService llmResponseService = new LLMResponseService();
        Map<String, LLMSqlResp> deduplicationSqlResp = llmResponseService.getDeduplicationSqlResp(llmResp);

        Assert.assertEquals(deduplicationSqlResp.size(), 1);

        // Same pair again: reordered columns and conditions still collapse to one entry
        sql1 = "SELECT a,b,c,d FROM table1 WHERE column1 = 1 AND column2 = 2 order by a";
        sql2 = "SELECT d,c,b,a FROM table1 WHERE column2 = 2 AND column1 = 1 order by a";

        LLMResp llmResp2 = new LLMResp();
        Map<String, LLMSqlResp> sqlWeight2 = new HashMap<>();
        sqlWeight2.put(sql1, LLMSqlResp.builder().sqlWeight(0.20).build());
        sqlWeight2.put(sql2, LLMSqlResp.builder().sqlWeight(0.80).build());

        llmResp2.setSqlRespMap(sqlWeight2);
        deduplicationSqlResp = llmResponseService.getDeduplicationSqlResp(llmResp2);

        Assert.assertEquals(deduplicationSqlResp.size(), 1);

        // Different column set (extra column e): both candidates are kept
        sql1 = "SELECT a,b,c,d,e FROM table1 WHERE column1 = 1 AND column2 = 2 order by a";
        sql2 = "SELECT d,c,b,a FROM table1 WHERE column2 = 2 AND column1 = 1 order by a";

        LLMResp llmResp3 = new LLMResp();
        Map<String, LLMSqlResp> sqlWeight3 = new HashMap<>();
        sqlWeight3.put(sql1, LLMSqlResp.builder().sqlWeight(0.20).build());
        sqlWeight3.put(sql2, LLMSqlResp.builder().sqlWeight(0.80).build());
        llmResp3.setSqlRespMap(sqlWeight3);
        deduplicationSqlResp = llmResponseService.getDeduplicationSqlResp(llmResp3);

        Assert.assertEquals(deduplicationSqlResp.size(), 2);
    }
}
@@ -0,0 +1,56 @@
package com.tencent.supersonic.chat.core.s2sql;

import com.tencent.supersonic.headless.api.pojo.SchemaElement;
import com.tencent.supersonic.headless.api.pojo.SchemaValueMap;
import com.tencent.supersonic.headless.api.pojo.SemanticSchema;
import org.junit.jupiter.api.Test;
import org.mockito.Mockito;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import static org.mockito.Mockito.when;

class LLMSqlParserTest {

    @Test
    void setFilter() {
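        // Builds a mocked SemanticSchema whose 歌手名 dimension carries a
        // SchemaValueMap (bizName 杰伦 -> techName 周杰伦, plus aliases) alongside a
        // plain dimension and a metric. As committed, the method only prepares the
        // mocks and makes no assertions.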
        SemanticSchema mockSemanticSchema = Mockito.mock(SemanticSchema.class);

        List<SchemaElement> dimensions = new ArrayList<>();
        List<SchemaValueMap> schemaValueMaps = new ArrayList<>();
        SchemaValueMap value1 = new SchemaValueMap();
        value1.setBizName("杰伦");
        value1.setTechName("周杰伦");
        value1.setAlias(Arrays.asList("周杰倫", "Jay Chou", "周董", "周先生"));
        schemaValueMaps.add(value1);

        SchemaElement schemaElement = SchemaElement.builder()
                .bizName("singer_name")
                .name("歌手名")
                .dataSet(2L)
                .schemaValueMaps(schemaValueMaps)
                .build();
        dimensions.add(schemaElement);

        SchemaElement schemaElement2 = SchemaElement.builder()
                .bizName("publish_time")
                .name("发布时间")
                .dataSet(2L)
                .build();
        dimensions.add(schemaElement2);

        when(mockSemanticSchema.getDimensions()).thenReturn(dimensions);

        List<SchemaElement> metrics = new ArrayList<>();
        SchemaElement metric = SchemaElement.builder()
                .bizName("play_count")
                .name("播放量")
                .dataSet(2L)
                .build();
        metrics.add(metric);

        when(mockSemanticSchema.getMetrics()).thenReturn(metrics);
    }
}