mirror of
https://github.com/tencentmusic/supersonic.git
synced 2026-04-28 03:14:18 +08:00
Compare commits
7 Commits
6a2b54002a
...
4d0483cdd4
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4d0483cdd4 | ||
|
|
1e01f3ef60 | ||
|
|
1155ac10d8 | ||
|
|
5a22590661 | ||
|
|
fc67411618 | ||
|
|
aaf2d46a56 | ||
|
|
c8abea9c1a |
@@ -1,10 +1,6 @@
|
||||
package com.tencent.supersonic.headless.api.pojo;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.ToString;
|
||||
import lombok.*;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
@@ -21,6 +17,7 @@ public class SchemaElementMatch implements Serializable {
|
||||
private String word;
|
||||
private Long frequency;
|
||||
private boolean isInherited;
|
||||
private boolean llmMatched;
|
||||
|
||||
public boolean isFullMatched() {
|
||||
return 1.0 == similarity;
|
||||
|
||||
@@ -13,6 +13,7 @@ public class EmbeddingResult extends MapResult {
|
||||
|
||||
private String id;
|
||||
private Map<String, String> metadata;
|
||||
private boolean llmMatched;
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
|
||||
@@ -1,9 +1,12 @@
|
||||
package com.tencent.supersonic.headless.chat.mapper;
|
||||
|
||||
import com.tencent.supersonic.common.pojo.enums.Text2SQLType;
|
||||
import com.tencent.supersonic.common.util.ContextUtils;
|
||||
import com.tencent.supersonic.common.util.JsonUtil;
|
||||
import com.tencent.supersonic.headless.api.pojo.SchemaElement;
|
||||
import com.tencent.supersonic.headless.api.pojo.SchemaElementMatch;
|
||||
import com.tencent.supersonic.headless.api.pojo.SchemaElementType;
|
||||
import com.tencent.supersonic.headless.api.pojo.SchemaMapInfo;
|
||||
import com.tencent.supersonic.headless.api.pojo.enums.MapModeEnum;
|
||||
import com.tencent.supersonic.headless.chat.ChatQueryContext;
|
||||
import com.tencent.supersonic.headless.chat.knowledge.EmbeddingResult;
|
||||
@@ -11,6 +14,7 @@ import com.tencent.supersonic.headless.chat.knowledge.builder.BaseWordBuilder;
|
||||
import com.tencent.supersonic.headless.chat.knowledge.helper.HanlpHelper;
|
||||
import dev.langchain4j.store.embedding.Retrieval;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.util.CollectionUtils;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
@@ -23,10 +27,16 @@ public class EmbeddingMapper extends BaseMapper {
|
||||
|
||||
@Override
|
||||
public boolean accept(ChatQueryContext chatQueryContext) {
|
||||
return MapModeEnum.LOOSE.equals(chatQueryContext.getRequest().getMapModeEnum());
|
||||
boolean b0 = MapModeEnum.LOOSE.equals(chatQueryContext.getRequest().getMapModeEnum());
|
||||
boolean b1 = chatQueryContext.getRequest().getText2SQLType() == Text2SQLType.LLM_OR_RULE;
|
||||
return b0 || b1;
|
||||
}
|
||||
|
||||
public void doMap(ChatQueryContext chatQueryContext) {
|
||||
|
||||
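// TODO: if this mapper already ran in the LOOSE stage it does not need to run again in the LLM_OR_RULE stage; there is no state to carry that information yet, so the redundant work is tolerated for now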
|
||||
SchemaMapInfo mappedInfo = chatQueryContext.getMapInfo();
|
||||
|
||||
// 1. Query from embedding by queryText
|
||||
EmbeddingMatchStrategy matchStrategy = ContextUtils.getBean(EmbeddingMatchStrategy.class);
|
||||
List<EmbeddingResult> matchResults = getMatches(chatQueryContext, matchStrategy);
|
||||
@@ -53,15 +63,26 @@ public class EmbeddingMapper extends BaseMapper {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
// Build SchemaElementMatch object
|
||||
SchemaElementMatch schemaElementMatch = SchemaElementMatch.builder()
|
||||
.element(schemaElement).frequency(BaseWordBuilder.DEFAULT_FREQUENCY)
|
||||
.word(matchResult.getName()).similarity(matchResult.getSimilarity())
|
||||
.detectWord(matchResult.getDetectWord()).build();
|
||||
schemaElementMatch.setLlmMatched(matchResult.isLlmMatched());
|
||||
|
||||
// 3. Add SchemaElementMatch to mapInfo
|
||||
addToSchemaMap(chatQueryContext.getMapInfo(), dataSetId, schemaElementMatch);
|
||||
}
|
||||
if (CollectionUtils.isEmpty(matchResults)) {
|
||||
log.info("embedding mapper no match");
|
||||
} else {
|
||||
for (EmbeddingResult matchResult : matchResults) {
|
||||
log.info("embedding match name=[{}],detectWord=[{}],similarity=[{}],metadata=[{}]",
|
||||
matchResult.getName(), matchResult.getDetectWord(),
|
||||
matchResult.getSimilarity(), JsonUtil.toString(matchResult.getMetadata()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -1,9 +1,17 @@
|
||||
package com.tencent.supersonic.headless.chat.mapper;
|
||||
|
||||
import com.alibaba.fastjson.JSONObject;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.hankcs.hanlp.seg.common.Term;
|
||||
import com.tencent.supersonic.headless.api.pojo.response.S2Term;
|
||||
import com.tencent.supersonic.headless.chat.ChatQueryContext;
|
||||
import com.tencent.supersonic.headless.chat.knowledge.EmbeddingResult;
|
||||
import com.tencent.supersonic.headless.chat.knowledge.MetaEmbeddingService;
|
||||
import com.tencent.supersonic.headless.chat.knowledge.helper.HanlpHelper;
|
||||
import dev.langchain4j.model.chat.ChatLanguageModel;
|
||||
import dev.langchain4j.model.input.Prompt;
|
||||
import dev.langchain4j.model.input.PromptTemplate;
|
||||
import dev.langchain4j.provider.ModelProvider;
|
||||
import dev.langchain4j.store.embedding.Retrieval;
|
||||
import dev.langchain4j.store.embedding.RetrieveQuery;
|
||||
import dev.langchain4j.store.embedding.RetrieveQueryResult;
|
||||
@@ -14,18 +22,12 @@ import org.springframework.beans.BeanUtils;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.Callable;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import static com.tencent.supersonic.headless.chat.mapper.MapperConfig.EMBEDDING_MAPPER_NUMBER;
|
||||
import static com.tencent.supersonic.headless.chat.mapper.MapperConfig.EMBEDDING_MAPPER_ROUND_NUMBER;
|
||||
import static com.tencent.supersonic.headless.chat.mapper.MapperConfig.EMBEDDING_MAPPER_THRESHOLD;
|
||||
import static com.tencent.supersonic.headless.chat.mapper.MapperConfig.*;
|
||||
|
||||
/**
|
||||
* EmbeddingMatchStrategy uses vector database to perform similarity search against the embeddings
|
||||
@@ -35,37 +37,165 @@ import static com.tencent.supersonic.headless.chat.mapper.MapperConfig.EMBEDDING
|
||||
@Slf4j
|
||||
public class EmbeddingMatchStrategy extends BatchMatchStrategy<EmbeddingResult> {
|
||||
|
||||
@Autowired
|
||||
protected MapperConfig mapperConfig;
|
||||
|
||||
@Autowired
|
||||
private MetaEmbeddingService metaEmbeddingService;
|
||||
|
||||
private static final String LLM_FILTER_PROMPT =
|
||||
"""
|
||||
\
|
||||
#Role: You are a professional data analyst specializing in metrics and dimensions.
|
||||
#Task: Given a user query and a list of retrieved metrics/dimensions through vector recall,
|
||||
please analyze which metrics/dimensions the user is most likely interested in.
|
||||
#Rules:
|
||||
1. Based on user query and retrieved info, accurately determine metrics/dimensions user truly cares about.
|
||||
2. Do not return all retrieved info, only select those highly relevant to user query.
|
||||
3. Maintain high quality output, exclude metrics/dimensions irrelevant to user intent.
|
||||
4. Output must be in JSON array format, only include IDs from retrieved info, e.g.: ['id1', 'id2']
|
||||
5. Return JSON content directly without markdown formatting
|
||||
#Input Example:
|
||||
#User Query: {{userText}}
|
||||
#Retrieved Metrics/Dimensions: {{retrievedInfo}}
|
||||
#Output:""";
|
||||
|
||||
@Override
|
||||
public List<EmbeddingResult> detect(ChatQueryContext chatQueryContext, List<S2Term> terms,
|
||||
Set<Long> detectDataSetIds) {
|
||||
if (chatQueryContext == null || CollectionUtils.isEmpty(detectDataSetIds)) {
|
||||
log.warn("Invalid input parameters: context={}, dataSetIds={}", chatQueryContext,
|
||||
detectDataSetIds);
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
// 1. Base detection
|
||||
List<EmbeddingResult> baseResults = super.detect(chatQueryContext, terms, detectDataSetIds);
|
||||
|
||||
boolean useLLM = Boolean.parseBoolean(mapperConfig.getParameterValue(EMBEDDING_MAPPER_USE_LLM));
|
||||
|
||||
// 2. LLM enhanced detection
|
||||
if (useLLM) {
|
||||
List<EmbeddingResult> llmResults = detectWithLLM(chatQueryContext, detectDataSetIds);
|
||||
if (!CollectionUtils.isEmpty(llmResults)) {
|
||||
baseResults.addAll(llmResults);
|
||||
}
|
||||
}
|
||||
|
||||
// 3. Deduplicate results
|
||||
return baseResults.stream().distinct().collect(Collectors.toList());
|
||||
}
|
||||
|
||||
/**
|
||||
* Perform enhanced detection using LLM
|
||||
*/
|
||||
private List<EmbeddingResult> detectWithLLM(ChatQueryContext chatQueryContext,
|
||||
Set<Long> detectDataSetIds) {
|
||||
try {
|
||||
String queryText = chatQueryContext.getRequest().getQueryText();
|
||||
if (StringUtils.isBlank(queryText)) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
// Get segmentation results
|
||||
Set<String> detectSegments = extractValidSegments(queryText);
|
||||
if (CollectionUtils.isEmpty(detectSegments)) {
|
||||
log.info("No valid segments found for text: {}", queryText);
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
return detectByBatch(chatQueryContext, detectDataSetIds, detectSegments, true);
|
||||
} catch (Exception e) {
|
||||
log.error("Error in LLM detection for context: {}", chatQueryContext, e);
|
||||
return Collections.emptyList();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract valid word segments by filtering out unwanted word natures
|
||||
*/
|
||||
private Set<String> extractValidSegments(String text) {
|
||||
List<String> natureList = Arrays.asList(StringUtils.split(mapperConfig.getParameterValue(EMBEDDING_MAPPER_ALLOWED_SEGMENT_NATURE ), ","));
|
||||
return HanlpHelper.getSegment().seg(text).stream()
|
||||
.filter(t -> natureList.stream().noneMatch(nature -> t.nature.startsWith(nature)))
|
||||
.map(Term::getWord).collect(Collectors.toSet());
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<EmbeddingResult> detectByBatch(ChatQueryContext chatQueryContext,
|
||||
Set<Long> detectDataSetIds, Set<String> detectSegments) {
|
||||
return detectByBatch(chatQueryContext, detectDataSetIds, detectSegments, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Process detection in batches with LLM option
|
||||
*
|
||||
* @param chatQueryContext The context of the chat query
|
||||
* @param detectDataSetIds Target dataset IDs for detection
|
||||
* @param detectSegments Segments to be detected
|
||||
* @param useLlm Whether to use LLM for filtering results
|
||||
* @return List of embedding results
|
||||
*/
|
||||
public List<EmbeddingResult> detectByBatch(ChatQueryContext chatQueryContext,
|
||||
Set<Long> detectDataSetIds, Set<String> detectSegments, boolean useLlm) {
|
||||
Set<EmbeddingResult> results = ConcurrentHashMap.newKeySet();
|
||||
int embeddingMapperBatch = Integer
|
||||
.valueOf(mapperConfig.getParameterValue(MapperConfig.EMBEDDING_MAPPER_BATCH));
|
||||
|
||||
List<String> queryTextsList =
|
||||
detectSegments.stream().map(detectSegment -> detectSegment.trim())
|
||||
.filter(detectSegment -> StringUtils.isNotBlank(detectSegment))
|
||||
.collect(Collectors.toList());
|
||||
// Process and filter query texts
|
||||
List<String> queryTextsList = detectSegments.stream().map(String::trim)
|
||||
.filter(StringUtils::isNotBlank).collect(Collectors.toList());
|
||||
|
||||
// Partition queries into sub-lists for batch processing
|
||||
List<List<String>> queryTextsSubList =
|
||||
Lists.partition(queryTextsList, embeddingMapperBatch);
|
||||
|
||||
// Create and execute tasks for each batch
|
||||
List<Callable<Void>> tasks = new ArrayList<>();
|
||||
for (List<String> queryTextsSub : queryTextsSubList) {
|
||||
tasks.add(createTask(chatQueryContext, detectDataSetIds, queryTextsSub, results));
|
||||
tasks.add(
|
||||
createTask(chatQueryContext, detectDataSetIds, queryTextsSub, results, useLlm));
|
||||
}
|
||||
executeTasks(tasks);
|
||||
|
||||
// Apply LLM filtering if enabled
|
||||
if (useLlm) {
|
||||
Map<String, Object> variable = new HashMap<>();
|
||||
variable.put("userText", chatQueryContext.getRequest().getQueryText());
|
||||
variable.put("retrievedInfo", JSONObject.toJSONString(results));
|
||||
|
||||
Prompt prompt = PromptTemplate.from(LLM_FILTER_PROMPT).apply(variable);
|
||||
ChatLanguageModel chatLanguageModel = ModelProvider.getChatModel();
|
||||
String response = chatLanguageModel.generate(prompt.toUserMessage().singleText());
|
||||
|
||||
if (StringUtils.isBlank(response)) {
|
||||
results.clear();
|
||||
} else {
|
||||
List<String> retrievedIds = JSONObject.parseArray(response, String.class);
|
||||
results = results.stream().filter(t -> retrievedIds.contains(t.getId()))
|
||||
.collect(Collectors.toSet());
|
||||
results.forEach(r -> r.setLlmMatched(true));
|
||||
}
|
||||
}
|
||||
|
||||
return new ArrayList<>(results);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a task for batch processing
|
||||
*
|
||||
* @param chatQueryContext The context of the chat query
|
||||
* @param detectDataSetIds Target dataset IDs
|
||||
* @param queryTextsSub Sub-list of query texts to process
|
||||
* @param results Shared result set for collecting results
|
||||
* @param useLlm Whether to use LLM
|
||||
* @return Callable task
|
||||
*/
|
||||
private Callable<Void> createTask(ChatQueryContext chatQueryContext, Set<Long> detectDataSetIds,
|
||||
List<String> queryTextsSub, Set<EmbeddingResult> results) {
|
||||
List<String> queryTextsSub, Set<EmbeddingResult> results, boolean useLlm) {
|
||||
return () -> {
|
||||
List<EmbeddingResult> oneRoundResults =
|
||||
detectByQueryTextsSub(detectDataSetIds, queryTextsSub, chatQueryContext);
|
||||
List<EmbeddingResult> oneRoundResults = detectByQueryTextsSub(detectDataSetIds,
|
||||
queryTextsSub, chatQueryContext, useLlm);
|
||||
synchronized (results) {
|
||||
selectResultInOneRound(results, oneRoundResults);
|
||||
}
|
||||
@@ -73,57 +203,73 @@ public class EmbeddingMatchStrategy extends BatchMatchStrategy<EmbeddingResult>
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Process a sub-list of query texts
|
||||
*
|
||||
* @param detectDataSetIds Target dataset IDs
|
||||
* @param queryTextsSub Sub-list of query texts
|
||||
* @param chatQueryContext Chat query context
|
||||
* @param useLlm Whether to use LLM
|
||||
* @return List of embedding results for this batch
|
||||
*/
|
||||
private List<EmbeddingResult> detectByQueryTextsSub(Set<Long> detectDataSetIds,
|
||||
List<String> queryTextsSub, ChatQueryContext chatQueryContext) {
|
||||
List<String> queryTextsSub, ChatQueryContext chatQueryContext, boolean useLlm) {
|
||||
Map<Long, List<Long>> modelIdToDataSetIds = chatQueryContext.getModelIdToDataSetIds();
|
||||
|
||||
// Get configuration parameters
|
||||
double threshold =
|
||||
Double.valueOf(mapperConfig.getParameterValue(EMBEDDING_MAPPER_THRESHOLD));
|
||||
|
||||
// step1. build query params
|
||||
RetrieveQuery retrieveQuery = RetrieveQuery.builder().queryTextsList(queryTextsSub).build();
|
||||
|
||||
// step2. retrieveQuery by detectSegment
|
||||
Double.parseDouble(mapperConfig.getParameterValue(EMBEDDING_MAPPER_THRESHOLD));
|
||||
int embeddingNumber =
|
||||
Integer.valueOf(mapperConfig.getParameterValue(EMBEDDING_MAPPER_NUMBER));
|
||||
Integer.parseInt(mapperConfig.getParameterValue(EMBEDDING_MAPPER_NUMBER));
|
||||
int embeddingRoundNumber =
|
||||
Integer.parseInt(mapperConfig.getParameterValue(EMBEDDING_MAPPER_ROUND_NUMBER));
|
||||
|
||||
// Build and execute query
|
||||
RetrieveQuery retrieveQuery = RetrieveQuery.builder().queryTextsList(queryTextsSub).build();
|
||||
List<RetrieveQueryResult> retrieveQueryResults = metaEmbeddingService.retrieveQuery(
|
||||
retrieveQuery, embeddingNumber, modelIdToDataSetIds, detectDataSetIds);
|
||||
|
||||
if (CollectionUtils.isEmpty(retrieveQueryResults)) {
|
||||
return new ArrayList<>();
|
||||
return Collections.emptyList();
|
||||
}
|
||||
// step3. build EmbeddingResults
|
||||
List<EmbeddingResult> collect = retrieveQueryResults.stream().map(retrieveQueryResult -> {
|
||||
List<Retrieval> retrievals = retrieveQueryResult.getRetrieval();
|
||||
if (CollectionUtils.isNotEmpty(retrievals)) {
|
||||
retrievals.removeIf(retrieval -> {
|
||||
if (!retrieveQueryResult.getQuery().contains(retrieval.getQuery())) {
|
||||
return retrieval.getSimilarity() < threshold;
|
||||
}
|
||||
return false;
|
||||
});
|
||||
|
||||
// Process results
|
||||
List<EmbeddingResult> collect = retrieveQueryResults.stream().peek(result -> {
|
||||
if (!useLlm && CollectionUtils.isNotEmpty(result.getRetrieval())) {
|
||||
result.getRetrieval()
|
||||
.removeIf(retrieval -> !result.getQuery().contains(retrieval.getQuery())
|
||||
&& retrieval.getSimilarity() < threshold);
|
||||
}
|
||||
return retrieveQueryResult;
|
||||
}).filter(retrieveQueryResult -> CollectionUtils
|
||||
.isNotEmpty(retrieveQueryResult.getRetrieval()))
|
||||
.flatMap(retrieveQueryResult -> retrieveQueryResult.getRetrieval().stream()
|
||||
.map(retrieval -> {
|
||||
EmbeddingResult embeddingResult = new EmbeddingResult();
|
||||
BeanUtils.copyProperties(retrieval, embeddingResult);
|
||||
embeddingResult.setDetectWord(retrieveQueryResult.getQuery());
|
||||
embeddingResult.setName(retrieval.getQuery());
|
||||
Map<String, String> convertedMap = retrieval.getMetadata().entrySet()
|
||||
.stream().collect(Collectors.toMap(Map.Entry::getKey,
|
||||
entry -> entry.getValue().toString()));
|
||||
embeddingResult.setMetadata(convertedMap);
|
||||
return embeddingResult;
|
||||
}))
|
||||
}).filter(result -> CollectionUtils.isNotEmpty(result.getRetrieval()))
|
||||
.flatMap(result -> result.getRetrieval().stream()
|
||||
.map(retrieval -> convertToEmbeddingResult(result, retrieval)))
|
||||
.collect(Collectors.toList());
|
||||
|
||||
// step4. select mapResul in one round
|
||||
int embeddingRoundNumber =
|
||||
Integer.valueOf(mapperConfig.getParameterValue(EMBEDDING_MAPPER_ROUND_NUMBER));
|
||||
int roundNumber = embeddingRoundNumber * queryTextsSub.size();
|
||||
return collect.stream().sorted(Comparator.comparingDouble(EmbeddingResult::getSimilarity))
|
||||
.limit(roundNumber).collect(Collectors.toList());
|
||||
// Sort and limit results
|
||||
return collect.stream()
|
||||
.sorted(Comparator.comparingDouble(EmbeddingResult::getSimilarity).reversed())
|
||||
.limit(embeddingRoundNumber * queryTextsSub.size()).collect(Collectors.toList());
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert RetrieveQueryResult and Retrieval to EmbeddingResult
|
||||
*
|
||||
* @param queryResult The query result containing retrieval information
|
||||
* @param retrieval The retrieval data to be converted
|
||||
* @return Converted EmbeddingResult
|
||||
*/
|
||||
private EmbeddingResult convertToEmbeddingResult(RetrieveQueryResult queryResult,
|
||||
Retrieval retrieval) {
|
||||
EmbeddingResult embeddingResult = new EmbeddingResult();
|
||||
BeanUtils.copyProperties(retrieval, embeddingResult);
|
||||
embeddingResult.setDetectWord(queryResult.getQuery());
|
||||
embeddingResult.setName(retrieval.getQuery());
|
||||
|
||||
// Convert metadata to string values
|
||||
Map<String, String> metadata = retrieval.getMetadata().entrySet().stream().collect(
|
||||
Collectors.toMap(Map.Entry::getKey, entry -> String.valueOf(entry.getValue())));
|
||||
embeddingResult.setMetadata(metadata);
|
||||
|
||||
return embeddingResult;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,12 +7,7 @@ import com.tencent.supersonic.headless.chat.ChatQueryContext;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.springframework.util.CollectionUtils;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.*;
|
||||
import java.util.function.Predicate;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@@ -66,7 +61,7 @@ public class MapFilter {
|
||||
List<SchemaElementMatch> value = entry.getValue();
|
||||
if (!CollectionUtils.isEmpty(value)) {
|
||||
value.removeIf(schemaElementMatch -> StringUtils
|
||||
.length(schemaElementMatch.getDetectWord()) <= 1);
|
||||
.length(schemaElementMatch.getDetectWord()) <= 1 && !schemaElementMatch.isLlmMatched());
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -85,7 +80,7 @@ public class MapFilter {
|
||||
}
|
||||
|
||||
public static void filterByQueryDataType(ChatQueryContext chatQueryContext,
|
||||
Predicate<SchemaElement> needRemovePredicate) {
|
||||
Predicate<SchemaElement> needRemovePredicate) {
|
||||
Map<Long, List<SchemaElementMatch>> dataSetElementMatches =
|
||||
chatQueryContext.getMapInfo().getDataSetElementMatches();
|
||||
for (Map.Entry<Long, List<SchemaElementMatch>> entry : dataSetElementMatches.entrySet()) {
|
||||
|
||||
@@ -57,4 +57,12 @@ public class MapperConfig extends ParameterConfig {
|
||||
public static final Parameter EMBEDDING_MAPPER_ROUND_NUMBER =
|
||||
new Parameter("s2.mapper.embedding.round.number", "10", "向量召回最小相似度阈值",
|
||||
"向量召回相似度阈值在动态调整中的最低值", "number", "Mapper相关配置");
|
||||
|
||||
public static final Parameter EMBEDDING_MAPPER_USE_LLM =
|
||||
new Parameter("s2.mapper.embedding.use-llm-enhance", "false", "使用LLM对召回的向量进行二次判断开关",
|
||||
"embedding的结果再通过一次LLM来筛选,这时候忽略各个向量阀值", "bool", "Mapper相关配置");
|
||||
|
||||
public static final Parameter EMBEDDING_MAPPER_ALLOWED_SEGMENT_NATURE =
|
||||
new Parameter("s2.mapper.embedding.allowed-segment-nature", "['v', 'd', 'a']", "使用LLM召回二次处理时对问题分词词性的控制",
|
||||
"分词后允许的词性才会进行向量召回", "list", "Mapper相关配置");
|
||||
}
|
||||
|
||||
@@ -303,8 +303,8 @@ public class S2SemanticLayerService implements SemanticLayerService {
|
||||
|
||||
QueryStatement queryStatement = new QueryStatement();
|
||||
queryStatement.setEnableOptimize(queryUtils.enableOptimize());
|
||||
queryStatement.setLimit(Integer.parseInt(translatorConfig.getParameterValue(
|
||||
TranslatorConfig.TRANSLATOR_RESULT_LIMIT)));
|
||||
queryStatement.setLimit(Integer.parseInt(
|
||||
translatorConfig.getParameterValue(TranslatorConfig.TRANSLATOR_RESULT_LIMIT)));
|
||||
queryStatement.setDataSetId(queryReq.getDataSetId());
|
||||
queryStatement.setDataSetName(queryReq.getDataSetName());
|
||||
queryStatement.setSemanticSchema(semanticSchemaResp);
|
||||
|
||||
@@ -145,8 +145,7 @@ public class ModelServiceImpl implements ModelService {
|
||||
// Comment out below checks for now, they seem unnecessary and
|
||||
// lead to unexpected exception in updating model
|
||||
/*
|
||||
checkParams(modelReq);
|
||||
checkRelations(modelReq);
|
||||
* checkParams(modelReq); checkRelations(modelReq);
|
||||
*/
|
||||
ModelDO modelDO = modelRepository.getModelById(modelReq.getId());
|
||||
ModelConverter.convert(modelDO, modelReq, user);
|
||||
|
||||
@@ -110,7 +110,8 @@ public class ModelConverter {
|
||||
dimensionReq.setExpr(dim.getExpr());
|
||||
dimensionReq.setType(dim.getType().name());
|
||||
dimensionReq
|
||||
.setDescription(Objects.isNull(dim.getDescription()) ? "" : dim.getDescription());
|
||||
.setDescription(Objects.isNull(dim.getDescription()) ? dimensionReq.getDescription()
|
||||
: dim.getDescription());
|
||||
dimensionReq.setTypeParams(dim.getTypeParams());
|
||||
return dimensionReq;
|
||||
}
|
||||
|
||||
@@ -100,7 +100,28 @@ public class QueryUtils {
|
||||
column.setDataFormatType(metricRespMap.get(nameEn).getDataFormatType());
|
||||
column.setDataFormat(metricRespMap.get(nameEn).getDataFormat());
|
||||
column.setModelId(metricRespMap.get(nameEn).getModelId());
|
||||
} else {
|
||||
// if column nameEn contains metric name, use metric dataFormatType
|
||||
metricRespMap.values().forEach(metric -> {
|
||||
if (nameEn.contains(metric.getName()) || nameEn.contains(metric.getBizName())) {
|
||||
column.setDataFormatType(metric.getDataFormatType());
|
||||
column.setDataFormat(metric.getDataFormat());
|
||||
column.setModelId(metric.getModelId());
|
||||
}
|
||||
// if column nameEn contains metric alias, use metric dataFormatType
|
||||
if (column.getDataFormatType() == null && metric.getAlias() != null) {
|
||||
for (String alias : metric.getAlias().split(",")) {
|
||||
if (nameEn.contains(alias)) {
|
||||
column.setDataFormatType(metric.getDataFormatType());
|
||||
column.setDataFormat(metric.getDataFormat());
|
||||
column.setModelId(metric.getModelId());
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
if (dimensionRespMap.containsKey(nameEn)) {
|
||||
column.setModelId(dimensionRespMap.get(nameEn).getModelId());
|
||||
}
|
||||
@@ -119,7 +140,7 @@ public class QueryUtils {
|
||||
|| type.equalsIgnoreCase("float") || type.equalsIgnoreCase("double")
|
||||
|| type.equalsIgnoreCase("real") || type.equalsIgnoreCase("numeric")
|
||||
|| type.toLowerCase().startsWith("decimal") || type.toLowerCase().startsWith("uint")
|
||||
|| type.toLowerCase().startsWith("int");
|
||||
|| type.toLowerCase().startsWith("int") || type.toLowerCase().equalsIgnoreCase("decfloat");
|
||||
}
|
||||
|
||||
private String getName(String nameEn) {
|
||||
|
||||
@@ -18,23 +18,14 @@
|
||||
</properties>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.springdoc</groupId>
|
||||
<artifactId>springdoc-openapi-starter-webmvc-ui</artifactId>
|
||||
<version>2.1.0</version>
|
||||
<exclusions>
|
||||
<exclusion>
|
||||
<groupId>org.springframework</groupId>
|
||||
<artifactId>spring-expression</artifactId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<groupId>org.springframework</groupId>
|
||||
<artifactId>spring-beans</artifactId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<groupId>org.springframework</groupId>
|
||||
<artifactId>spring-webmvc</artifactId>
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
<groupId>com.github.xiaoymin</groupId>
|
||||
<artifactId>knife4j-openapi3-jakarta-spring-boot-starter</artifactId>
|
||||
<version>4.5.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>jakarta.xml.bind</groupId>
|
||||
<artifactId>jakarta.xml.bind-api</artifactId>
|
||||
<version>4.0.2</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.tencent.supersonic</groupId>
|
||||
|
||||
@@ -6,6 +6,7 @@ spring:
|
||||
password: ${S2_DB_PASSWORD:}
|
||||
sql:
|
||||
init:
|
||||
continue-on-error: true
|
||||
mode: always
|
||||
username: ${S2_DB_USER:root}
|
||||
password: ${S2_DB_PASSWORD:}
|
||||
|
||||
@@ -1,14 +1,15 @@
|
||||
spring:
|
||||
datasource:
|
||||
driver-class-name: org.postgresql.Driver
|
||||
url: jdbc:postgresql://${S2_DB_HOST:localhost}:${S2_DB_PORT:5432}/${S2_DB_DATABASE:postgres}?stringtype=unspecified
|
||||
username: ${S2_DB_USER:postgres}
|
||||
password: ${S2_DB_PASSWORD:postgres}
|
||||
url: jdbc:postgresql://localhost:5432/postgres?stringtype=unspecified
|
||||
username: postgres
|
||||
password: postgres
|
||||
sql:
|
||||
init:
|
||||
continue-on-error: true
|
||||
mode: always
|
||||
username: ${S2_DB_USER:postgres}
|
||||
password: ${S2_DB_PASSWORD:postgres}
|
||||
username: postgres
|
||||
password: postgres
|
||||
schema-locations: classpath:db/schema-postgres.sql,classpath:db/schema-postgres-demo.sql
|
||||
data-locations: classpath:db/data-postgres.sql,classpath:db/data-postgres-demo.sql
|
||||
|
||||
@@ -17,9 +18,9 @@ s2:
|
||||
store:
|
||||
provider: PGVECTOR
|
||||
base:
|
||||
url: ${S2_DB_HOST:127.0.0.1}
|
||||
port: ${S2_DB_PORT:5432}
|
||||
databaseName: ${S2_DB_DATABASE:postgres}
|
||||
user: ${S2_DB_USER:postgres}
|
||||
password: ${S2_DB_PASSWORD:postgres}
|
||||
url: 127.0.0.1
|
||||
port: 5432
|
||||
databaseName: postgres
|
||||
user: postgres
|
||||
password: postgres
|
||||
dimension: 512
|
||||
@@ -30,9 +30,29 @@ logging:
|
||||
springdoc:
|
||||
swagger-ui:
|
||||
path: /swagger-ui.html
|
||||
enabled: true
|
||||
tags-sorter: alpha
|
||||
operations-sorter: alpha
|
||||
api-docs:
|
||||
path: /v3/api-docs
|
||||
enabled: true
|
||||
group-configs:
|
||||
- group: 'default'
|
||||
paths-to-match: '/**'
|
||||
packages-to-scan: com.tencent.supersonic
|
||||
paths-to-match: /api/chat/**,/api/semantic/**
|
||||
paths-to-match: /api/chat/**,/api/semantic/**
|
||||
|
||||
knife4j:
|
||||
enable: true
|
||||
openapi:
|
||||
title: 'SuperSonic API Documentation'
|
||||
description: 'SuperSonic API Documentation'
|
||||
version: v1.0
|
||||
setting:
|
||||
language: zh-CN
|
||||
# basic:
|
||||
# enable: true
|
||||
# username: test
|
||||
# password: 123456#
|
||||
documents:
|
||||
default:
|
||||
title: ChatBI API Documents
|
||||
description: ChatBI API Documents
|
||||
@@ -1,8 +1,3 @@
|
||||
-- clear data if already exists
|
||||
DELETE FROM s2_user;
|
||||
DELETE FROM s2_available_date_info;
|
||||
DELETE FROM s2_canvas;
|
||||
|
||||
-- sample user
|
||||
-- The default value for the password is 123456
|
||||
INSERT INTO s2_user (`name`, password, salt, display_name, email, is_admin) values ('admin','c3VwZXJzb25pY0BiaWNvbdktJJYWw6A3rEmBUPzbn/6DNeYnD+y3mAwDKEMS3KVT','jGl25bVBBBW96Qi9Te4V3w==','admin','admin@xx.com', 1);
|
||||
|
||||
@@ -1,8 +1,3 @@
|
||||
-- clear data if already exists
|
||||
DELETE FROM s2_user;
|
||||
DELETE FROM s2_available_date_info;
|
||||
DELETE FROM s2_canvas;
|
||||
|
||||
-- sample user
|
||||
-- The default value for the password is 123456
|
||||
insert into s2_user ("name", password, salt, display_name, email, is_admin) values ('admin','c3VwZXJzb25pY0BiaWNvbdktJJYWw6A3rEmBUPzbn/6DNeYnD+y3mAwDKEMS3KVT','jGl25bVBBBW96Qi9Te4V3w==','admin','admin@xx.com', 1);
|
||||
|
||||
@@ -41,6 +41,7 @@ CREATE TABLE IF NOT EXISTS `s2_available_date_info` (
|
||||
`updated_at` timestamp NULL,
|
||||
`updated_by` varchar(100) COLLATE utf8mb4_unicode_ci NOT NULL,
|
||||
`status` tinyint DEFAULT 0,
|
||||
UNIQUE(`item_id`, `type`),
|
||||
PRIMARY KEY (`id`)
|
||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
|
||||
|
||||
@@ -382,6 +383,7 @@ CREATE TABLE IF NOT EXISTS s2_user
|
||||
salt varchar(256) DEFAULT NULL COMMENT 'md5密码盐',
|
||||
email varchar(100) null,
|
||||
is_admin tinyint null,
|
||||
UNIQUE (`name`),
|
||||
PRIMARY KEY (`id`)
|
||||
);
|
||||
|
||||
|
||||
@@ -37,7 +37,8 @@ CREATE TABLE IF NOT EXISTS s2_available_date_info (
|
||||
created_by varchar(100) NOT NULL,
|
||||
updated_at timestamp NULL,
|
||||
updated_by varchar(100) NOT NULL,
|
||||
status smallint DEFAULT 0
|
||||
status smallint DEFAULT 0,
|
||||
UNIQUE(item_id, type)
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS s2_chat (
|
||||
@@ -491,5 +492,6 @@ CREATE TABLE IF NOT EXISTS s2_user (
|
||||
password varchar(256) NULL,
|
||||
salt varchar(256) DEFAULT NULL,
|
||||
email varchar(100) NULL,
|
||||
is_admin smallint NULL
|
||||
is_admin smallint NULL,
|
||||
UNIQUE(name)
|
||||
);
|
||||
@@ -41,3 +41,5 @@ s2:
|
||||
threshold: 0.5
|
||||
min:
|
||||
threshold: 0.3
|
||||
embedding:
|
||||
use-llm-enhance: true
|
||||
|
||||
1
pom.xml
1
pom.xml
@@ -78,7 +78,6 @@
|
||||
<spotless.version>2.27.1</spotless.version>
|
||||
<spotless.skip>false</spotless.skip>
|
||||
<stax2.version>4.2.1</stax2.version>
|
||||
<io.springfox.version>3.0.0</io.springfox.version>
|
||||
<aws-java-sdk.version>1.12.780</aws-java-sdk.version>
|
||||
</properties>
|
||||
|
||||
|
||||
@@ -51,6 +51,7 @@ const getCreateFieldName = (type: EnumDataSourceType) => {
|
||||
EnumDataSourceType.CATEGORICAL,
|
||||
EnumDataSourceType.TIME,
|
||||
EnumDataSourceType.PARTITION_TIME,
|
||||
EnumDataSourceType.FOREIGN,
|
||||
].includes(type as EnumDataSourceType)
|
||||
? 'isCreateDimension'
|
||||
: 'isCreateMetric';
|
||||
@@ -101,7 +102,7 @@ const ModelFieldForm: React.FC<Props> = ({
|
||||
value={selectTypeValue}
|
||||
allowClear
|
||||
onChange={(value) => {
|
||||
let defaultParams = {};
|
||||
let defaultParams:any = {};
|
||||
if (value === EnumDataSourceType.MEASURES) {
|
||||
defaultParams = {
|
||||
agg: AGG_OPTIONS[0].value,
|
||||
@@ -127,12 +128,13 @@ const ModelFieldForm: React.FC<Props> = ({
|
||||
};
|
||||
} else {
|
||||
defaultParams = {
|
||||
type: value,
|
||||
agg: undefined,
|
||||
dateFormat: undefined,
|
||||
timeGranularity: undefined,
|
||||
};
|
||||
}
|
||||
const isCreateName = getCreateFieldName(value);
|
||||
const isCreateName = getCreateFieldName(defaultParams.type);
|
||||
const editState = !isUndefined(record[isCreateName])
|
||||
? !!record[isCreateName]
|
||||
: true;
|
||||
|
||||
Reference in New Issue
Block a user