[improvement]Use QueryWrapper in place of hard-coded SQLs (#1944)
Some checks are pending
supersonic CentOS CI / build (11) (push) Waiting to run
supersonic CentOS CI / build (21) (push) Waiting to run
supersonic CentOS CI / build (8) (push) Waiting to run
supersonic mac CI / build (11) (push) Waiting to run
supersonic mac CI / build (21) (push) Waiting to run
supersonic mac CI / build (8) (push) Waiting to run
supersonic ubuntu CI / build (11) (push) Waiting to run
supersonic ubuntu CI / build (21) (push) Waiting to run
supersonic ubuntu CI / build (8) (push) Waiting to run
supersonic windows CI / build (11) (push) Waiting to run
supersonic windows CI / build (21) (push) Waiting to run
supersonic windows CI / build (8) (push) Waiting to run

* [improvement][launcher]Use the API to get element IDs instead of hard-coding them.

* [fix][launcher]Fix mysql scripts.

* [improvement][launcher]Support DuckDB database and refactor translator code structure.

* [improvement][headless-fe] Revamped the interaction for semantic modeling routing and successfully implemented the switching between dimension and dataset management.

* [improvement][Headless] Add table ddl in Dbschema

* [improvement][Headless] Add get database by type

* [improvement][Headless] Supports automatic batch creation of models based on db table names.

* [improvement][Headless] Supports getting domain by bizName

* [improvement][launcher]Refactor unit tests and demo data.

* [fix][launcher]Change default vector dimension to 512.

* [improvement](Dict) add dimValueAliasMap info for KnowledgeBaseService

* [improvement][headless]Use QueryWrapper to replace hard-code SQL in mapper xml.

* [improvement][chat]Introduce ChatMemory to delegate ChatMemoryDO.

* [fix][common]Fix embedding store sys configs.

* [fix][common]Fix postgres schema, using varchar instead of char.

* [improvement][launcher]Change supersonic docker deployment from mysql to postgres.

* [Fix][launcher]Fix a number of issues related to semantic modeling.

* [Fix][headless]Fix the evaluation logic of agg type.

* [fix][assembly]Fix Dockerfile and add docker compose run script.

* [fix][chat]Fix the "multiple assignments to same column 'similar_queries'" error.

* [improvement][headless]Use LambdaQueryWrapper to avoid hard-coded column names.

* [improvement][headless]Refactor headless infra to support advanced semantic modelling.

* [improvement][headless]Change class name `Dim` to `Dimension`.

* [improvement][chat]Introduce `TimeFieldMapper` to always map time field.

* [fix][headless]Remove unnecessary dimension existence check.

* [fix][chat]Fix adjusted filters don't take effect.

---------
This commit is contained in:
Jun Zhang
2024-12-08 13:32:29 +08:00
committed by GitHub
parent 0fc29304a8
commit e55f43c737
120 changed files with 844 additions and 5810 deletions

View File

@@ -19,6 +19,7 @@ public class DictWord {
private String word;
private String nature;
private String natureWithFrequency;
private String alias;
@Override
public boolean equals(Object o) {

View File

@@ -1,11 +1,14 @@
package com.tencent.supersonic.headless.chat.knowledge;
import com.baomidou.mybatisplus.core.toolkit.CollectionUtils;
import com.tencent.supersonic.common.pojo.enums.DictWordType;
import com.tencent.supersonic.headless.api.pojo.response.S2Term;
import com.tencent.supersonic.headless.chat.knowledge.helper.HanlpHelper;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
@@ -14,6 +17,31 @@ import java.util.stream.Collectors;
@Service
@Slf4j
public class KnowledgeBaseService {
/**
 * Process-wide registry of dimension-value alias words, keyed by dimension id
 * (see {@code addDimValueAlias}, which stores each list under its {@code dimId}).
 * Only the map reference is {@code volatile}; the map itself is a plain {@link HashMap}.
 */
private static volatile Map<Long, List<DictWord>> dimValueAliasMap = new HashMap<>();
/**
 * Returns the alias registry.
 *
 * @return the backing map itself, not a copy, so changes made by the caller are
 *         visible to every other user of the registry
 */
// NOTE(review): the returned HashMap is shared and unsynchronized; confirm that readers
// and writers never run concurrently, or switch to a concurrent map.
public static Map<Long, List<DictWord>> getDimValueAlias() {
return dimValueAliasMap;
}
/**
 * Registers alias words for a dimension, skipping every word that is already registered.
 *
 * <p>Two words are considered the same when their nature-with-frequency, word and alias
 * all match. Duplicates are rejected both against the words already stored for the
 * dimension and against earlier entries of the same {@code newWords} batch.
 *
 * @param dimId id of the dimension the aliases belong to
 * @param newWords candidate alias words; {@code null} or empty adds nothing
 * @return the live list of alias words stored for {@code dimId} after the update
 */
public static List<DictWord> addDimValueAlias(Long dimId, List<DictWord> newWords) {
    // The list is always registered, even when nothing is added, so callers get the
    // same instance that later lookups will see.
    List<DictWord> dimValueAlias =
            dimValueAliasMap.computeIfAbsent(dimId, id -> new ArrayList<>());
    if (newWords == null || newWords.isEmpty()) {
        return dimValueAlias;
    }
    // Fully qualified to avoid requiring an extra import in this file.
    Set<String> knownKeys = new java.util.HashSet<>();
    for (DictWord existing : dimValueAlias) {
        knownKeys.add(dimValueAliasKey(existing));
    }
    for (DictWord candidate : newWords) {
        // Set.add returns false for a key that is already present, which also rejects
        // duplicates that appear more than once inside newWords.
        if (knownKeys.add(dimValueAliasKey(candidate))) {
            dimValueAlias.add(candidate);
        }
    }
    return dimValueAlias;
}

/** Builds the identity key used to detect duplicate alias words. */
private static String dimValueAliasKey(DictWord word) {
    return String.format("%s_%s_%s", word.getNatureWithFrequency(), word.getWord(),
            word.getAlias());
}
public void updateSemanticKnowledge(List<DictWord> natures) {
@@ -41,6 +69,11 @@ public class KnowledgeBaseService {
}
// 2. update online knowledge
if (CollectionUtils.isNotEmpty(dimValueAliasMap)) {
for (Long dimId : dimValueAliasMap.keySet()) {
natures.addAll(dimValueAliasMap.get(dimId));
}
}
updateOnlineKnowledge(natures);
}

View File

@@ -12,6 +12,8 @@ import com.hankcs.hanlp.dictionary.other.CharTable;
import com.hankcs.hanlp.seg.common.Term;
import com.hankcs.hanlp.utility.LexiconUtility;
import com.hankcs.hanlp.utility.TextUtility;
import com.tencent.supersonic.common.pojo.Constants;
import com.tencent.supersonic.common.pojo.enums.DictWordType;
import com.tencent.supersonic.headless.chat.knowledge.helper.HanlpHelper;
import java.io.BufferedOutputStream;
@@ -103,7 +105,22 @@ public class MultiCustomDictionary extends DynamicCustomDictionary {
String word = getWordBySpace(param[0]);
if (isLetters) {
original = word;
word = word.toLowerCase();
// word = word.toLowerCase();
// Register the lowercase form as an alias
if (!original.equals(word.toLowerCase())) {
DictWord dictWord = new DictWord();
String nature = param[1];
dictWord.setNatureWithFrequency(
String.format("%s " + Constants.DEFAULT_FREQUENCY, nature));
dictWord.setWord(word);
dictWord.setAlias(word.toLowerCase());
String[] split = nature.split(DictWordType.NATURE_SPILT);
if (split.length >= 2) {
Long dimId = Long.parseLong(
nature.split(DictWordType.NATURE_SPILT)[split.length - 1]);
KnowledgeBaseService.addDimValueAlias(dimId, Arrays.asList(dictWord));
}
}
}
if (natureCount == 0) {
attribute = new CoreDictionary.Attribute(defaultNature);

View File

@@ -8,12 +8,15 @@ import com.tencent.supersonic.headless.api.pojo.SchemaMapInfo;
import com.tencent.supersonic.headless.api.pojo.response.S2Term;
import com.tencent.supersonic.headless.chat.ChatQueryContext;
import com.tencent.supersonic.headless.chat.knowledge.DatabaseMapResult;
import com.tencent.supersonic.headless.chat.knowledge.DictWord;
import com.tencent.supersonic.headless.chat.knowledge.HanlpMapResult;
import com.tencent.supersonic.headless.chat.knowledge.KnowledgeBaseService;
import com.tencent.supersonic.headless.chat.knowledge.builder.BaseWordBuilder;
import com.tencent.supersonic.headless.chat.knowledge.helper.HanlpHelper;
import com.tencent.supersonic.headless.chat.knowledge.helper.NatureHelper;
import com.tencent.supersonic.headless.chat.utils.EditDistanceUtils;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.springframework.util.CollectionUtils;
import java.util.HashSet;
@@ -83,12 +86,32 @@ public class KeywordMapper extends BaseMapper {
.element(element).frequency(frequency).word(hanlpMapResult.getName())
.similarity(hanlpMapResult.getSimilarity())
.detectWord(hanlpMapResult.getDetectWord()).build();
// doDimValueAliasLogic replaces a dimension-value alias with the real dimension value
doDimValueAliasLogic(schemaElementMatch);
addToSchemaMap(chatQueryContext.getMapInfo(), dataSetId, schemaElementMatch);
}
}
}
/**
 * Replaces the matched word with the real dimension value when the word is a
 * registered alias.
 *
 * <p>Only matches whose element type is {@code VALUE} are considered. The element id is
 * used to look up the alias words registered in {@link KnowledgeBaseService}; when
 * several registered words share the matched alias, the last one in the list wins.
 *
 * @param schemaElementMatch the match to rewrite in place
 */
private void doDimValueAliasLogic(SchemaElementMatch schemaElementMatch) {
    SchemaElement matchedElement = schemaElementMatch.getElement();
    if (!SchemaElementType.VALUE.equals(matchedElement.getType())) {
        return;
    }

    Long dimensionId = matchedElement.getId();
    String matchedWord = schemaElementMatch.getWord();
    Map<Long, List<DictWord>> aliasesByDimension = KnowledgeBaseService.getDimValueAlias();
    if (dimensionId == null || StringUtils.isEmpty(matchedWord)
            || !aliasesByDimension.containsKey(dimensionId)) {
        return;
    }

    // Keep scanning after a hit so that a later entry overrides an earlier one.
    DictWord resolved = null;
    for (DictWord candidate : aliasesByDimension.get(dimensionId)) {
        if (matchedWord.equals(candidate.getAlias())) {
            resolved = candidate;
        }
    }
    if (resolved != null) {
        schemaElementMatch.setWord(resolved.getWord());
    }
}
private void convertMapResultToMapInfo(ChatQueryContext chatQueryContext,
List<DatabaseMapResult> mapResults) {
for (DatabaseMapResult match : mapResults) {

View File

@@ -0,0 +1,37 @@
package com.tencent.supersonic.headless.chat.mapper;
import com.tencent.supersonic.common.pojo.enums.Text2SQLType;
import com.tencent.supersonic.headless.api.pojo.DataSetSchema;
import com.tencent.supersonic.headless.api.pojo.SchemaElement;
import com.tencent.supersonic.headless.api.pojo.SchemaElementMatch;
import com.tencent.supersonic.headless.chat.ChatQueryContext;
import lombok.extern.slf4j.Slf4j;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
@Slf4j
public class TimeFieldMapper extends BaseMapper {
@Override
public void doMap(ChatQueryContext chatQueryContext) {
if (chatQueryContext.getRequest().getText2SQLType().equals(Text2SQLType.ONLY_RULE)) {
return;
}
Map<Long, DataSetSchema> schemaMap =
chatQueryContext.getSemanticSchema().getDataSetSchemaMap();
for (Map.Entry<Long, DataSetSchema> entry : schemaMap.entrySet()) {
List<SchemaElement> timeDims = entry.getValue().getDimensions().stream()
.filter(dim -> dim.getTimeFormat() != null).collect(Collectors.toList());
for (SchemaElement schemaElement : timeDims) {
chatQueryContext.getMapInfo().getMatchedElements(entry.getKey())
.add(SchemaElementMatch.builder().word(schemaElement.getName())
.element(schemaElement).detectWord(schemaElement.getName())
.similarity(1.0).build());
}
}
}
}