mirror of
https://github.com/tencentmusic/supersonic.git
synced 2025-12-12 12:37:55 +00:00
[improvement]Use QueryWrapper in place of hard-coded SQLs (#1944)
Some checks are pending
supersonic CentOS CI / build (11) (push) Waiting to run
supersonic CentOS CI / build (21) (push) Waiting to run
supersonic CentOS CI / build (8) (push) Waiting to run
supersonic mac CI / build (11) (push) Waiting to run
supersonic mac CI / build (21) (push) Waiting to run
supersonic mac CI / build (8) (push) Waiting to run
supersonic ubuntu CI / build (11) (push) Waiting to run
supersonic ubuntu CI / build (21) (push) Waiting to run
supersonic ubuntu CI / build (8) (push) Waiting to run
supersonic windows CI / build (11) (push) Waiting to run
supersonic windows CI / build (21) (push) Waiting to run
supersonic windows CI / build (8) (push) Waiting to run
Some checks are pending
supersonic CentOS CI / build (11) (push) Waiting to run
supersonic CentOS CI / build (21) (push) Waiting to run
supersonic CentOS CI / build (8) (push) Waiting to run
supersonic mac CI / build (11) (push) Waiting to run
supersonic mac CI / build (21) (push) Waiting to run
supersonic mac CI / build (8) (push) Waiting to run
supersonic ubuntu CI / build (11) (push) Waiting to run
supersonic ubuntu CI / build (21) (push) Waiting to run
supersonic ubuntu CI / build (8) (push) Waiting to run
supersonic windows CI / build (11) (push) Waiting to run
supersonic windows CI / build (21) (push) Waiting to run
supersonic windows CI / build (8) (push) Waiting to run
* [improvement][launcher]Use API to get element ID to avoid hard-coding. * [fix][launcher]Fix mysql scripts. * [improvement][launcher]Support DuckDB database and refactor translator code structure. * [improvement][headless-fe] Revamped the interaction for semantic modeling routing and successfully implemented the switching between dimension and dataset management. * [improvement][Headless] Add table ddl in Dbschema * [improvement][Headless] Add get database by type * [improvement][Headless] Supports automatic batch creation of models based on db table names. * [improvement][Headless] Supports getting domain by bizName * [improvement][launcher]Refactor unit tests and demo data. * [fix][launcher]Change default vector dimension to 512. * [improvement](Dict) add dimValueAliasMap info for KnowledgeBaseService * [improvement][headless]Use QueryWrapper to replace hard-coded SQL in mapper xml. * [improvement][chat]Introduce ChatMemory to delegate ChatMemoryDO. * [fix][common]Fix embedding store sys configs. * [fix][common]Fix postgres schema, using varchar instead of char. * [improvement][launcher]Change supersonic docker deployment from mysql to postgres. * [Fix][launcher]Fix a number of issues related to semantic modeling. * [Fix][headless]Fix the evaluation logic of agg type. * [fix][assembly]Fix Dockerfile and add docker compose run script. * [fix][chat]Fix "multiple assignments to same column 'similar_queries'". * [improvement][headless]Use LambdaQueryWrapper to avoid hard-coded column names. * [improvement][headless]Refactor headless infra to support advanced semantic modelling. * [improvement][headless]Change class name `Dim` to `Dimension`. * [improvement][chat]Introduce `TimeFieldMapper` to always map time field. * [fix][headless]Remove unnecessary dimension existence check. * [fix][chat]Fix adjusted filters not taking effect. ---------
This commit is contained in:
@@ -19,6 +19,7 @@ public class DictWord {
|
||||
private String word;
|
||||
private String nature;
|
||||
private String natureWithFrequency;
|
||||
private String alias;
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
|
||||
@@ -1,11 +1,14 @@
|
||||
package com.tencent.supersonic.headless.chat.knowledge;
|
||||
|
||||
import com.baomidou.mybatisplus.core.toolkit.CollectionUtils;
|
||||
import com.tencent.supersonic.common.pojo.enums.DictWordType;
|
||||
import com.tencent.supersonic.headless.api.pojo.response.S2Term;
|
||||
import com.tencent.supersonic.headless.chat.knowledge.helper.HanlpHelper;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
@@ -14,6 +17,31 @@ import java.util.stream.Collectors;
|
||||
@Service
|
||||
@Slf4j
|
||||
public class KnowledgeBaseService {
|
||||
// Alias entries keyed by the dimension id passed to addDimValueAlias.
// NOTE(review): volatile only covers the map reference; the HashMap itself is mutated by
// addDimValueAlias without synchronization — confirm writers and readers never overlap.
private static volatile Map<Long, List<DictWord>> dimValueAliasMap = new HashMap<>();
|
||||
|
||||
/**
 * Returns the shared dimension-value alias registry.
 *
 * @return the registry map itself (not a copy), keyed by dimension id
 */
public static Map<Long, List<DictWord>> getDimValueAlias() {
|
||||
return dimValueAliasMap;
|
||||
}
|
||||
|
||||
public static List<DictWord> addDimValueAlias(Long dimId, List<DictWord> newWords) {
|
||||
List<DictWord> dimValueAlias =
|
||||
dimValueAliasMap.containsKey(dimId) ? dimValueAliasMap.get(dimId)
|
||||
: new ArrayList<>();
|
||||
Set<String> wordSet =
|
||||
dimValueAlias
|
||||
.stream().map(word -> String.format("%s_%s_%s",
|
||||
word.getNatureWithFrequency(), word.getWord(), word.getAlias()))
|
||||
.collect(Collectors.toSet());
|
||||
for (DictWord dictWord : newWords) {
|
||||
String key = String.format("%s_%s_%s", dictWord.getNatureWithFrequency(),
|
||||
dictWord.getWord(), dictWord.getAlias());
|
||||
if (!wordSet.contains(key)) {
|
||||
dimValueAlias.add(dictWord);
|
||||
}
|
||||
}
|
||||
dimValueAliasMap.put(dimId, dimValueAlias);
|
||||
return dimValueAlias;
|
||||
}
|
||||
|
||||
public void updateSemanticKnowledge(List<DictWord> natures) {
|
||||
|
||||
@@ -41,6 +69,11 @@ public class KnowledgeBaseService {
|
||||
}
|
||||
|
||||
// 2. update online knowledge
|
||||
if (CollectionUtils.isNotEmpty(dimValueAliasMap)) {
|
||||
for (Long dimId : dimValueAliasMap.keySet()) {
|
||||
natures.addAll(dimValueAliasMap.get(dimId));
|
||||
}
|
||||
}
|
||||
updateOnlineKnowledge(natures);
|
||||
}
|
||||
|
||||
|
||||
@@ -12,6 +12,8 @@ import com.hankcs.hanlp.dictionary.other.CharTable;
|
||||
import com.hankcs.hanlp.seg.common.Term;
|
||||
import com.hankcs.hanlp.utility.LexiconUtility;
|
||||
import com.hankcs.hanlp.utility.TextUtility;
|
||||
import com.tencent.supersonic.common.pojo.Constants;
|
||||
import com.tencent.supersonic.common.pojo.enums.DictWordType;
|
||||
import com.tencent.supersonic.headless.chat.knowledge.helper.HanlpHelper;
|
||||
|
||||
import java.io.BufferedOutputStream;
|
||||
@@ -103,7 +105,22 @@ public class MultiCustomDictionary extends DynamicCustomDictionary {
|
||||
String word = getWordBySpace(param[0]);
|
||||
if (isLetters) {
|
||||
original = word;
|
||||
word = word.toLowerCase();
|
||||
// word = word.toLowerCase();
|
||||
// 加入小写别名
|
||||
if (!original.equals(word.toLowerCase())) {
|
||||
DictWord dictWord = new DictWord();
|
||||
String nature = param[1];
|
||||
dictWord.setNatureWithFrequency(
|
||||
String.format("%s " + Constants.DEFAULT_FREQUENCY, nature));
|
||||
dictWord.setWord(word);
|
||||
dictWord.setAlias(word.toLowerCase());
|
||||
String[] split = nature.split(DictWordType.NATURE_SPILT);
|
||||
if (split.length >= 2) {
|
||||
Long dimId = Long.parseLong(
|
||||
nature.split(DictWordType.NATURE_SPILT)[split.length - 1]);
|
||||
KnowledgeBaseService.addDimValueAlias(dimId, Arrays.asList(dictWord));
|
||||
}
|
||||
}
|
||||
}
|
||||
if (natureCount == 0) {
|
||||
attribute = new CoreDictionary.Attribute(defaultNature);
|
||||
|
||||
@@ -8,12 +8,15 @@ import com.tencent.supersonic.headless.api.pojo.SchemaMapInfo;
|
||||
import com.tencent.supersonic.headless.api.pojo.response.S2Term;
|
||||
import com.tencent.supersonic.headless.chat.ChatQueryContext;
|
||||
import com.tencent.supersonic.headless.chat.knowledge.DatabaseMapResult;
|
||||
import com.tencent.supersonic.headless.chat.knowledge.DictWord;
|
||||
import com.tencent.supersonic.headless.chat.knowledge.HanlpMapResult;
|
||||
import com.tencent.supersonic.headless.chat.knowledge.KnowledgeBaseService;
|
||||
import com.tencent.supersonic.headless.chat.knowledge.builder.BaseWordBuilder;
|
||||
import com.tencent.supersonic.headless.chat.knowledge.helper.HanlpHelper;
|
||||
import com.tencent.supersonic.headless.chat.knowledge.helper.NatureHelper;
|
||||
import com.tencent.supersonic.headless.chat.utils.EditDistanceUtils;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.springframework.util.CollectionUtils;
|
||||
|
||||
import java.util.HashSet;
|
||||
@@ -83,12 +86,32 @@ public class KeywordMapper extends BaseMapper {
|
||||
.element(element).frequency(frequency).word(hanlpMapResult.getName())
|
||||
.similarity(hanlpMapResult.getSimilarity())
|
||||
.detectWord(hanlpMapResult.getDetectWord()).build();
|
||||
|
||||
// doDimValueAliasLogic 将维度值别名进行替换成真实维度值
|
||||
doDimValueAliasLogic(schemaElementMatch);
|
||||
addToSchemaMap(chatQueryContext.getMapInfo(), dataSetId, schemaElementMatch);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void doDimValueAliasLogic(SchemaElementMatch schemaElementMatch) {
|
||||
SchemaElement element = schemaElementMatch.getElement();
|
||||
if (SchemaElementType.VALUE.equals(element.getType())) {
|
||||
Long dimId = element.getId();
|
||||
String word = schemaElementMatch.getWord();
|
||||
Map<Long, List<DictWord>> dimValueAlias = KnowledgeBaseService.getDimValueAlias();
|
||||
if (Objects.nonNull(dimId) && StringUtils.isNotEmpty(word)
|
||||
&& dimValueAlias.containsKey(dimId)) {
|
||||
Map<String, DictWord> aliasAndDictMap = dimValueAlias.get(dimId).stream()
|
||||
.collect(Collectors.toMap(dictWord -> dictWord.getAlias(),
|
||||
dictWord -> dictWord, (v1, v2) -> v2));
|
||||
if (aliasAndDictMap.containsKey(word)) {
|
||||
String wordTech = aliasAndDictMap.get(word).getWord();
|
||||
schemaElementMatch.setWord(wordTech);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void convertMapResultToMapInfo(ChatQueryContext chatQueryContext,
|
||||
List<DatabaseMapResult> mapResults) {
|
||||
for (DatabaseMapResult match : mapResults) {
|
||||
|
||||
@@ -0,0 +1,37 @@
|
||||
package com.tencent.supersonic.headless.chat.mapper;
|
||||
|
||||
import com.tencent.supersonic.common.pojo.enums.Text2SQLType;
|
||||
import com.tencent.supersonic.headless.api.pojo.DataSetSchema;
|
||||
import com.tencent.supersonic.headless.api.pojo.SchemaElement;
|
||||
import com.tencent.supersonic.headless.api.pojo.SchemaElementMatch;
|
||||
import com.tencent.supersonic.headless.chat.ChatQueryContext;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@Slf4j
|
||||
public class TimeFieldMapper extends BaseMapper {
|
||||
|
||||
@Override
|
||||
public void doMap(ChatQueryContext chatQueryContext) {
|
||||
if (chatQueryContext.getRequest().getText2SQLType().equals(Text2SQLType.ONLY_RULE)) {
|
||||
return;
|
||||
}
|
||||
|
||||
Map<Long, DataSetSchema> schemaMap =
|
||||
chatQueryContext.getSemanticSchema().getDataSetSchemaMap();
|
||||
for (Map.Entry<Long, DataSetSchema> entry : schemaMap.entrySet()) {
|
||||
List<SchemaElement> timeDims = entry.getValue().getDimensions().stream()
|
||||
.filter(dim -> dim.getTimeFormat() != null).collect(Collectors.toList());
|
||||
for (SchemaElement schemaElement : timeDims) {
|
||||
chatQueryContext.getMapInfo().getMatchedElements(entry.getKey())
|
||||
.add(SchemaElementMatch.builder().word(schemaElement.getName())
|
||||
.element(schemaElement).detectWord(schemaElement.getName())
|
||||
.similarity(1.0).build());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
Reference in New Issue
Block a user