diff --git a/chat/core/src/main/java/com/tencent/supersonic/chat/agent/tool/AgentTool.java b/chat/core/src/main/java/com/tencent/supersonic/chat/agent/tool/AgentTool.java index 4ff8efe01..ff5c59029 100644 --- a/chat/core/src/main/java/com/tencent/supersonic/chat/agent/tool/AgentTool.java +++ b/chat/core/src/main/java/com/tencent/supersonic/chat/agent/tool/AgentTool.java @@ -9,6 +9,8 @@ import lombok.NoArgsConstructor; @AllArgsConstructor public class AgentTool { + private String id; + private String name; private AgentToolType type; diff --git a/chat/core/src/main/java/com/tencent/supersonic/chat/mapper/MapperHelper.java b/chat/core/src/main/java/com/tencent/supersonic/chat/mapper/MapperHelper.java index 005a57ac5..a69156a0b 100644 --- a/chat/core/src/main/java/com/tencent/supersonic/chat/mapper/MapperHelper.java +++ b/chat/core/src/main/java/com/tencent/supersonic/chat/mapper/MapperHelper.java @@ -14,7 +14,6 @@ import java.util.Set; import java.util.stream.Collectors; import lombok.Data; import lombok.extern.slf4j.Slf4j; -import org.apache.commons.collections.CollectionUtils; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Service; @@ -91,7 +90,7 @@ public class MapperHelper { Set detectModelIds = agentService.getDslToolsModelIds(request.getAgentId(), null); //contains all - if (isContainsAllModel(detectModelIds)) { + if (agentService.containsAllModel(detectModelIds)) { if (Objects.nonNull(modelId) && modelId > 0) { Set result = new HashSet<>(); result.add(modelId); @@ -113,9 +112,4 @@ public class MapperHelper { } return detectModelIds; } - - private boolean isContainsAllModel(Set detectModelIds) { - return CollectionUtils.isNotEmpty(detectModelIds) && detectModelIds.contains(-1L); - } - } diff --git a/chat/core/src/main/java/com/tencent/supersonic/chat/parser/llm/dsl/LLMDslParser.java b/chat/core/src/main/java/com/tencent/supersonic/chat/parser/llm/dsl/LLMDslParser.java index 82f3f26b0..6678f0849 100644 --- a/chat/core/src/main/java/com/tencent/supersonic/chat/parser/llm/dsl/LLMDslParser.java +++ b/chat/core/src/main/java/com/tencent/supersonic/chat/parser/llm/dsl/LLMDslParser.java @@ -40,6 +40,7 @@ import com.tencent.supersonic.semantic.api.query.enums.FilterOperatorEnum; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Objects; @@ -66,8 +67,12 @@ public class LLMDslParser implements SemanticParser { public void parse(QueryContext queryCtx, ChatContext chatCtx) { QueryReq request = queryCtx.getRequest(); LLMConfig llmConfig = ContextUtils.getBean(LLMConfig.class); - if (StringUtils.isEmpty(llmConfig.getUrl()) || SatisfactionChecker.check(queryCtx)) { - log.info("llmConfig:{}, skip dsl parser, queryText:{}", llmConfig, request.getQueryText()); + if (StringUtils.isEmpty(llmConfig.getUrl())) { + log.info("llm url is empty, skip dsl parser, llmConfig:{}", llmConfig); + return; + } + if (SatisfactionChecker.check(queryCtx)) { + log.info("skip dsl parser, queryText:{}", request.getQueryText()); return; } try { @@ -88,8 +93,8 @@ public class LLMDslParser implements SemanticParser { if (Objects.isNull(llmResp)) { return; } - DSLParseResult dslParseResult = DSLParseResult.builder().request(request).dslTool(dslTool).llmReq(llmReq) - .llmResp(llmResp).build(); + DSLParseResult dslParseResult = DSLParseResult.builder().request(request) + .dslTool(dslTool).llmReq(llmReq).llmResp(llmResp).build(); SemanticParseInfo parseInfo = getParseInfo(queryCtx, modelId, dslTool, dslParseResult); @@ -287,7 +292,14 @@ public class LLMDslParser implements SemanticParser { private DslTool getDslTool(QueryReq request, Long modelId) { AgentService agentService = ContextUtils.getBean(AgentService.class); List dslTools = agentService.getDslTools(request.getAgentId(), AgentToolType.DSL); - Optional dslToolOptional = dslTools.stream().filter(tool -> tool.getModelIds().contains(modelId)) + Optional dslToolOptional = dslTools.stream() + .filter(tool -> { + List modelIds = tool.getModelIds(); + if (agentService.containsAllModel(new HashSet<>(modelIds))) { + return true; + } + return modelIds.contains(modelId); + }) .findFirst(); return dslToolOptional.orElse(null); } @@ -295,6 +307,9 @@ public class LLMDslParser implements SemanticParser { private Long getModelId(QueryContext queryCtx, ChatContext chatCtx, Integer agentId) { AgentService agentService = ContextUtils.getBean(AgentService.class); Set distinctModelIds = agentService.getDslToolsModelIds(agentId, AgentToolType.DSL); + if (agentService.containsAllModel(distinctModelIds)) { + distinctModelIds = new HashSet<>(); + } ModelResolver modelResolver = ComponentFactory.getModelResolver(); Long modelId = modelResolver.resolve(queryCtx, chatCtx, distinctModelIds); log.info("resolve modelId:{},dslModels:{}", modelId, distinctModelIds); diff --git a/chat/core/src/main/java/com/tencent/supersonic/chat/parser/plugin/function/HeuristicModelResolver.java b/chat/core/src/main/java/com/tencent/supersonic/chat/parser/plugin/function/HeuristicModelResolver.java index b66217418..f3c01a0a4 100644 --- a/chat/core/src/main/java/com/tencent/supersonic/chat/parser/plugin/function/HeuristicModelResolver.java +++ b/chat/core/src/main/java/com/tencent/supersonic/chat/parser/plugin/function/HeuristicModelResolver.java @@ -24,7 +24,7 @@ import org.apache.commons.collections.CollectionUtils; public class HeuristicModelResolver implements ModelResolver { protected static Long selectModelBySchemaElementCount(Map modelQueryModes, - SchemaMapInfo schemaMap) { + SchemaMapInfo schemaMap) { Map modelTypeMap = getModelTypeMap(schemaMap); if (modelTypeMap.size() == 1) { Long modelSelect = modelTypeMap.entrySet().stream().collect(Collectors.toList()).get(0).getKey(); @@ -57,8 +57,8 @@ public class HeuristicModelResolver implements ModelResolver { * @return false will use context Model, true will use other Model , maybe include context Model */ protected static boolean isAllowSwitch(Map modelQueryModes, SchemaMapInfo schemaMap, - ChatContext chatCtx, QueryReq searchCtx, - Long modelId, Set restrictiveModels) { + ChatContext chatCtx, QueryReq searchCtx, + Long modelId, Set restrictiveModels) { if (!Objects.nonNull(modelId) || modelId <= 0) { return true; } @@ -137,7 +137,10 @@ public class HeuristicModelResolver implements ModelResolver { public Long resolve(QueryContext queryContext, ChatContext chatCtx, Set restrictiveModels) { Long modelId = queryContext.getRequest().getModelId(); if (Objects.nonNull(modelId) && modelId > 0) { - if (CollectionUtils.isNotEmpty(restrictiveModels) && restrictiveModels.contains(modelId)) { + if (CollectionUtils.isEmpty(restrictiveModels)) { + return modelId; + } + if (restrictiveModels.contains(modelId)) { return modelId; } else { return null; @@ -162,7 +165,7 @@ public class HeuristicModelResolver implements ModelResolver { } public Long resolve(Map modelQueryModes, QueryContext queryContext, - ChatContext chatCtx, SchemaMapInfo schemaMap, Set restrictiveModels) { + ChatContext chatCtx, SchemaMapInfo schemaMap, Set restrictiveModels) { Long selectModel = selectModel(modelQueryModes, queryContext.getRequest(), chatCtx, schemaMap, restrictiveModels); if (selectModel > 0) { @@ -174,8 +177,8 @@ public class HeuristicModelResolver implements ModelResolver { } public Long selectModel(Map modelQueryModes, QueryReq queryContext, - ChatContext chatCtx, - SchemaMapInfo schemaMap, Set restrictiveModels) { + ChatContext chatCtx, + SchemaMapInfo schemaMap, Set restrictiveModels) { // if QueryContext has modelId and in ModelQueryModes if (modelQueryModes.containsKey(queryContext.getModelId())) { log.info("selectModel from QueryContext [{}]", queryContext.getModelId()); diff --git a/chat/core/src/main/java/com/tencent/supersonic/chat/service/AgentService.java b/chat/core/src/main/java/com/tencent/supersonic/chat/service/AgentService.java index 8ce2308d8..087c5265e 100644 --- a/chat/core/src/main/java/com/tencent/supersonic/chat/service/AgentService.java +++ b/chat/core/src/main/java/com/tencent/supersonic/chat/service/AgentService.java @@ -23,4 +23,5 @@ public interface AgentService { Set getDslToolsModelIds(Integer agentId, AgentToolType agentToolType); + boolean containsAllModel(Set detectModelIds); } diff --git a/chat/core/src/main/java/com/tencent/supersonic/chat/service/impl/AgentServiceImpl.java b/chat/core/src/main/java/com/tencent/supersonic/chat/service/impl/AgentServiceImpl.java index df1651f0c..889776906 100644 --- a/chat/core/src/main/java/com/tencent/supersonic/chat/service/impl/AgentServiceImpl.java +++ b/chat/core/src/main/java/com/tencent/supersonic/chat/service/impl/AgentServiceImpl.java @@ -109,4 +109,9 @@ public class AgentServiceImpl implements AgentService { .flatMap(Collection::stream) .collect(Collectors.toSet()); } + + @Override + public boolean containsAllModel(Set detectModelIds) { + return !CollectionUtils.isEmpty(detectModelIds) && detectModelIds.contains(-1L); + } } diff --git a/chat/core/src/main/python/llm/api_service.py b/chat/core/src/main/python/llm/api_service.py index a5bd975f2..c06748b74 100644 --- a/chat/core/src/main/python/llm/api_service.py +++ b/chat/core/src/main/python/llm/api_service.py @@ -25,20 +25,31 @@ app = FastAPI() @app.post("/query2sql/") async def din_query2sql(query_body: Mapping[str, Any]): - if 'queryText' not in query_body: - raise HTTPException(status_code=400, + if 'queryText' not in query_body: + raise HTTPException(status_code=400, detail="query_text is not in query_body") - else: - query_text = query_body['queryText'] + else: + query_text = query_body['queryText'] - if 'schema' not in query_body: - raise HTTPException(status_code=400, detail="schema is not in query_body") - else: - schema = query_body['schema'] + if 'schema' not in query_body: + raise HTTPException(status_code=400, detail="schema is not in query_body") + else: + schema = query_body['schema'] - resp = query2sql(query_text=query_text, schema=schema) + if 'currentDate' not in query_body: + raise HTTPException(status_code=400, detail="currentDate is not in query_body") + else: + current_date = query_body['currentDate'] - return resp + if 'linking' not in query_body: + linking = None + else: + linking = query_body['linking'] + + resp = query2sql(query_text=query_text, + schema=schema, current_date=current_date, linking=linking) + + return resp @app.post("/preset_query_retrival/") diff --git a/chat/core/src/main/python/llm/few_shot_example/sql_exampler.py b/chat/core/src/main/python/llm/few_shot_example/sql_exampler.py index 990dc8ec8..87a32dc3d 100644 --- a/chat/core/src/main/python/llm/few_shot_example/sql_exampler.py +++ b/chat/core/src/main/python/llm/few_shot_example/sql_exampler.py @@ -1,147 +1,296 @@ -examplars= [ - { +examplars= [ + { "current_date":"2020-12-01", "table_name":"内容库产品", - "fields_list":"""["部门", "模块", "用户名", "访问次数", "访问人数", "访问时长"]""", - "question":"比较jerry和tom在内容库的访问次数", - "analysis": """让我们一步一步地思考。在问题“比较jerry和tom在内容库的访问次数“中,我们被问: -“内容库的访问次数”,所以我们需要column=[访问次数] -”比较jerry和tom“,所以我们需要column=[用户名] -基于table和columns,可能的cell values 是 = ['jerry', 'tom']。""", - "schema_links":"""["访问次数", "用户名", "'jerry'", "'tom'"]""", - "sql":"""select 用户名, 访问次数 from 内容库产品 where 用户名 in ('jerry', 'tom')""" - }, - { + "fields_list":"""["部门", "模块", "用户名", "访问次数", "访问人数", "访问时长", "数据日期"]""", + "question":"比较jackjchen和robinlee在内容库的访问次数", + "prior_schema_links":"""['jackjchen'->用户名, 'robinlee'->用户名]""", + "analysis": """让我们一步一步地思考。在问题“比较jackjchen和robinlee在内容库的访问次数“中,我们被问: +“比较jackjchen和robinlee”,所以我们需要column=[用户名] +”内容库的访问次数“,所以我们需要column=[访问次数] +基于table和columns,可能的cell values 是 = ['jackjchen', 'robinlee']。""", + "schema_links":"""["用户名", "访问次数", "'jackjchen'", "'robinlee'"]""", + "sql":"""select 用户名, 访问次数 from 内容库产品 where 用户名 in ('jackjchen', 'robinlee') and 数据日期 = '2020-12-01' """ + }, + { "current_date":"2022-11-06", "table_name":"内容库产品", - "fields_list":"""["部门", "模块", "用户名", "访问次数", "访问人数", "访问时长"]""", + "fields_list":"""["部门", "模块", "用户名", "访问次数", "访问人数", "访问时长", "数据日期"]""", "question":"内容库近12个月访问人数 按部门", + "prior_schema_links":"""[]""", "analysis": """让我们一步一步地思考。在问题“内容库近12个月访问人数 按部门“中,我们被问: -“内容库近12个月访问人数”,所以我们需要column=[访问人数] +”内容库近12个月“,所以我们需要column=[数据日期] +“访问人数”,所以我们需要column=[访问人数] ”按部门“,所以我们需要column=[部门] -基于table和columns,可能的cell values 是 = []。""", - "schema_links":"""["访问人数", "部门"]""", - "sql":"""select 部门, sum(访问人数) from 内容库产品 where 部门 group by 部门""" - }, - { +基于table和columns,可能的cell values 是 = [12]。""", + "schema_links":"""["访问人数", "部门", "数据日期", 12]""", + "sql":"""select 部门, 数据日期, 访问人数 from 内容库产品 where datediff('month', 数据日期, '2022-11-06') <= 12 """ + }, + { "current_date":"2023-04-21", "table_name":"内容库产品", - "fields_list":"""["部门", "模块", "用户名", "访问次数", "访问人数", "访问时长"]""", - "question":"内容库编辑部、美术部的访问时长", - "analysis": """让我们一步一步地思考。在问题“内容库编辑部、美术部的访问时长“中,我们被问: + "fields_list":"""["部门", "模块", "用户名", "访问次数", "访问人数", "访问时长", "数据日期"]""", + "question":"内容库美术部、技术研发部的访问时长", + "prior_schema_links":"""['美术部'->部门, '技术研发部'->部门]""", + "analysis": """让我们一步一步地思考。在问题“内容库美术部、技术研发部的访问时长“中,我们被问: “访问时长”,所以我们需要column=[访问时长] -”内容库编辑部、美术部“,所以我们需要column=[部门] -基于table和columns,可能的cell values 是 = ['编辑部', '美术部']。""", - "schema_links":"""["访问时长", "部门", "'编辑部'", "'美术部'"]""", - "sql":"""select 部门, 访问时长 from 内容库产品 where 部门 in ('编辑部', '美术部')""" - }, - { - "table_name":"精选", - "fields_list":"""['归属系', '付费模式', '结算播放份额', '付费用户结算播放份额']""", - "question":"近3天飞天系结算播放份额", - "analysis": """让我们一步一步地思考。在问题“近3天飞天系结算播放份额“中,我们被问: -“结算播放份额”,所以我们需要column=[结算播放份额] -”飞天系“,所以我们需要column=[归属系] -基于table和columns,可能的cell values 是 = ['飞天系']。""", - "schema_links":"""["结算播放份额", "归属系", "'飞天系'"]""", - "sql":"""select 归属系, 结算播放份额 from 精选 where 归属系 in ('飞天系')""" - }, - { +”内容库美术部、技术研发部“,所以我们需要column=[部门] +基于table和columns,可能的cell values 是 = ['美术部', '技术研发部']。""", + "schema_links":"""["访问时长", "部门", "'美术部'", "'技术研发部'"]""", + "sql":"""select 部门, 访问时长 from 内容库产品 where 部门 in ('美术部', '技术研发部') and 数据日期 = '2023-04-21' """ + }, + { "current_date":"2023-08-21", + "table_name":"严选", + "fields_list":"""["严选版权归属系", "付费模式", "结算播放份额", "付费用户结算播放份额", "数据日期"]""", + "question":"近3天海田飞系MPPM结算播放份额", + "prior_schema_links":"""['海田飞系'->严选版权归属系]""", + "analysis": """让我们一步一步地思考。在问题“近3天海田飞系MPPM结算播放份额“中,我们被问: +“MPPM结算播放份额”,所以我们需要column=[结算播放份额] +”海田飞系“,所以我们需要column=[严选版权归属系] +”近3天“,所以我们需要column=[数据日期] +基于table和columns,可能的cell values 是 = ['海田飞系', 3]。""", + "schema_links":"""["结算播放份额", "严选版权归属系", "数据日期", "'海田飞系'", 3]""", + "sql":"""select 严选版权归属系, 结算播放份额 from 严选 where 严选版权归属系 = '海田飞系' and datediff('day', 数据日期, '2023-08-21') <= 3 """ + }, + { "current_date":"2023-05-22", "table_name":"歌曲库", - "fields_list":"""['歌曲ID', '歌曲MID', '歌曲名', '歌曲版本', '歌曲类型', '翻唱类型', '结算播放量', '运营播放量', '付费用户结算播放量', '历史累计结算播放量', '运营搜播量', '结算搜播量', '运营完播量', '运营推播量', '近7日复播率', '日均搜播量']""", - "question":"对比近3天翻唱版和纯音乐的歌曲播放量", + "fields_list":"""["是否潮流人歌曲", "C音歌曲ID", "C音歌曲MID", "歌曲名", "歌曲版本", "语种", "歌曲类型", "翻唱类型", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "结算播放量", "运营播放量", "付费用户结算播放量", "历史累计结算播放量", "运营搜播量", "结算搜播量", "运营完播量", "运营推播量", "近7日复播率", "日均搜播量", "数据日期"]""", + "question":"对比近7天翻唱版和纯音乐的歌曲播放量", + "prior_schema_links":"""['纯音乐'->语种, '翻唱版'->歌曲版本]""", "analysis": """让我们一步一步地思考。在问题“对比近3天翻唱版和纯音乐的歌曲播放量“中,我们被问: “歌曲播放量”,所以我们需要column=[结算播放量] -”翻唱版和纯音乐“,所以我们需要column=[歌曲类型] -基于table和columns,可能的cell values 是 = ['翻唱版', '纯音乐']。""", - "schema_links":"""["结算播放量", "歌曲类型", "'翻唱版'", "'纯音乐'"]""", - "sql":"""select 歌曲类型, 结算播放量 from 歌曲库 where 歌曲类型 in ('翻唱版', '纯音乐')""" - }, - { +”翻唱版“,所以我们需要column=[歌曲版本] +”和纯音乐的歌曲“,所以我们需要column=[语种] +”近7天“,所以我们需要column=[数据日期] +基于table和columns,可能的cell values 是 = ['翻唱版', '纯音乐', 7]。""", + "schema_links":"""["结算播放量", "歌曲版本", "语种", "数据日期", "'翻唱版'", "'纯音乐'", 7]""", + "sql":"""select 歌曲版本, 语种, 结算播放量 from 歌曲库 where 歌曲版本 = '翻唱版' and 语种 = '纯音乐' and datediff('day', 数据日期, '2023-05-22') <= 7 """ + }, + { "current_date":"2023-05-31", "table_name":"艺人库", - "fields_list":"""['上下架状态', '歌手名', '歌手等级', '歌手类型', '歌手来源', '活跃区域', '年龄', '歌手才能', '歌手风格', '粉丝数', '在架歌曲数', '有播放量歌曲数']""", - "question":"对比一下流得滑、锅富程、章雪友的粉丝数", - "analysis": """让我们一步一步地思考。在问题“对比一下流得滑、锅富程、章雪友的粉丝数“中,我们被问: + "fields_list":"""["上下架状态", "歌手名", "歌手等级", "歌手类型", "歌手来源", "MPPM潮流人等级", "活跃区域", "年龄", "歌手才能", "歌手风格", "粉丝数", "潮音粉丝数", "超声波粉丝数", "推博粉丝数", "超声波歌曲数", "在架歌曲数", "超声波分享数", "独占歌曲数", "超声波在架歌曲评论数", "有播放量歌曲数", "数据日期"]""", + "question":"对比一下陈拙悬、孟梅琦、赖媚韵的粉丝数", + "prior_schema_links":"""['1527896'->MPPM歌手ID, '1565463'->MPPM歌手ID, '2141459'->MPPM歌手ID]""", + "analysis": """让我们一步一步地思考。在问题“对比一下陈拙悬、孟梅琦、赖媚韵的粉丝数“中,我们被问: “粉丝数”,所以我们需要column=[粉丝数] -”流得滑、锅富程、章雪友“,所以我们需要column=[歌手名] -基于table和columns,可能的cell values 是 = ['流得滑', '锅富程', '章雪友']。""", - "schema_links":"""["粉丝数", "歌手名", "'流得滑'", "'锅富程'", "'章雪友'"]""", - "sql":"""select 歌手名, 粉丝数 from 艺人库 where 歌手名 in ('流得滑', '锅富程', '章雪友')""" - }, - { +”陈拙悬、孟梅琦、赖媚韵“,所以我们需要column=[歌手名] +基于table和columns,可能的cell values 是 = ['陈拙悬', '孟梅琦', '赖媚韵']。""", + "schema_links":"""["粉丝数", "歌手名", "'陈拙悬'", "'孟梅琦'", "'赖媚韵'"]""", + "sql":"""select 歌手名, 粉丝数 from 艺人库 where 歌手名 in ('陈拙悬', '孟梅琦', '赖媚韵') and 数据日期 = '2023-05-31' """ + }, + { "current_date":"2023-07-31", "table_name":"歌曲库", - "fields_list":"""['歌曲ID', '歌曲MID', '歌曲名', '歌曲版本', '歌曲类型', '翻唱类型', '结算播放量', '运营播放量', '付费用户结算播放量', '历史累计结算播放量', '运营搜播量', '结算搜播量', '运营完播量', '运营推播量', '近7日复播率', '日均搜播量']""", + "fields_list":"""["歌曲名", "歌曲版本", "歌曲类型", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""", "question":"播放量大于1万的歌曲有多少", + "prior_schema_links":"""[]""", "analysis": """让我们一步一步地思考。在问题“播放量大于1万的歌曲有多少“中,我们被问: “歌曲有多少”,所以我们需要column=[歌曲名] -”播放量大于1万“,所以我们需要column=[结算播放量] +”播放量大于1万的“,所以我们需要column=[结算播放量] 基于table和columns,可能的cell values 是 = [10000]。""", "schema_links":"""["歌曲名", "结算播放量", 10000]""", - "sql":"""select 歌曲名 from 歌曲库 where 结算播放量 > 10000""" - }, - { + "sql":"""select 歌曲名 from 歌曲库 where 结算播放量 > 10000 and 数据日期 = '2023-07-31' """ + }, + { "current_date":"2023-07-31", "table_name":"内容库产品", - "fields_list":"""['用户名', '部门', '模块', '访问时长', '访问次数', '访问人数']""", + "fields_list":"""["用户名", "部门", "模块", "访问时长", "访问次数", "访问人数", "数据日期"]""", "question":"内容库访问时长小于1小时,且来自美术部的用户是哪些", + "prior_schema_links":"""['美术部'->部门]""", "analysis": """让我们一步一步地思考。在问题“内容库访问时长小于1小时,且来自美术部的用户是哪些“中,我们被问: “用户是哪些”,所以我们需要column=[用户名] ”美术部的“,所以我们需要column=[部门] ”访问时长小于1小时“,所以我们需要column=[访问时长] 基于table和columns,可能的cell values 是 = ['美术部', 1]。""", "schema_links":"""["用户名", "部门", "访问时长", "'美术部'", 1]""", - "sql":"""select 用户名 from 内容库产品 where 部门 = '美术部' and 访问时长 < 1""" - }, - { + "sql":"""select 用户名 from 内容库产品 where 部门 = '美术部' and 访问时长 < 1 and 数据日期 = '2023-07-31' """ + }, + { "current_date":"2023-08-31", "table_name":"内容库产品", - "fields_list":"""['用户名', '部门', '模块', '访问次数', '访问人数', '访问时长']""", + "fields_list":"""["用户名", "部门", "模块", "访问时长", "访问次数", "访问人数", "数据日期"]""", "question":"内容库pv最高的用户有哪些", + "prior_schema_links":"""[]""", "analysis": """让我们一步一步地思考。在问题“内容库pv最高的用户有哪些“中,我们被问: “用户有哪些”,所以我们需要column=[用户名] ”pv最高的“,所以我们需要column=[访问次数] 基于table和columns,可能的cell values 是 = []。""", "schema_links":"""["用户名", "访问次数"]""", - "sql":"""select 用户名 from 内容库产品 order by 访问次数 desc limit 10""" - }, - { + "sql":"""select 用户名 from 内容库产品 where 数据日期 = '2023-08-31' order by 访问次数 desc limit 10 """ + }, + { "current_date":"2023-08-31", "table_name":"艺人库", - "fields_list":"""['歌手名', '歌手等级', '歌手类型', '歌手来源', '结算播放量', '运营播放量', '历史累计结算播放量', '有播放量歌曲数', '历史累计运营播放量', '付费用户结算播放量', '结算播放量占比', '运营播放份额', '完播量']""", - "question":"近90天袁呀味播放量平均值是多少", - "analysis": """让我们一步一步地思考。在问题“近90天袁呀味播放量平均值是多少“中,我们被问: + "fields_list":"""["播放量层级", "播放量单调性", "播放量方差", "播放量突增类型", "播放量集中度", "歌手名", "歌手等级", "歌手类型", "歌手来源", "MPPM潮流人等级", "结算播放量", "运营播放量", "历史累计结算播放量", "有播放量歌曲数", "历史累计运营播放量", "付费用户结算播放量", "结算播放量占比", "运营播放份额", "免费用户结算播放占比", "完播量", "数据日期"]""", + "question":"近90天袁亚伟播放量平均值是多少", + "prior_schema_links":"""['152789226'->MPPM歌手ID]""", + "analysis": """让我们一步一步地思考。在问题“近90天袁亚伟播放量平均值是多少“中,我们被问: “播放量平均值是多少”,所以我们需要column=[结算播放量] -”袁呀味“,所以我们需要column=[歌手名] -基于table和columns,可能的cell values 是 = ['袁呀味']。""", - "schema_links":"""["结算播放量", "歌手名", "'袁呀味'"]""", - "sql":"""select avg(结算播放量) from 艺人库 where 歌手名 = '袁呀味'""" - }, - { +”袁亚伟“,所以我们需要column=[歌手名] +”近90天“,所以我们需要column=[数据日期] +基于table和columns,可能的cell values 是 = ['袁亚伟', 90]。""", + "schema_links":"""["结算播放量", "歌手名", "数据日期", "'袁亚伟'", 90]""", + "sql":"""select avg(结算播放量) from 艺人库 where 歌手名 = '袁亚伟' and datediff('day', 数据日期, '2023-08-31') <= 90 """ + }, + { "current_date":"2023-08-31", "table_name":"艺人库", - "fields_list":"""['歌手名', '歌手等级', '歌手类型', '歌手来源', '结算播放量', '运营播放量', '历史累计结算播放量', '有播放量歌曲数', '历史累计运营播放量', '付费用户结算播放量', '结算播放量占比', '运营播放份额', '完播量']""", - "question":"周浅近7天结算播放量总和是多少", - "analysis": """让我们一步一步地思考。在问题“周浅近7天结算播放量总和是多少“中,我们被问: + "fields_list":"""["播放量层级", "播放量单调性", "播放量方差", "播放量突增类型", "播放量集中度", "歌手名", "歌手等级", "歌手类型", "歌手来源", "MPPM潮流人等级", "结算播放量", "运营播放量", "历史累计结算播放量", "有播放量歌曲数", "历史累计运营播放量", "付费用户结算播放量", "结算播放量占比", "运营播放份额", "免费用户结算播放占比", "完播量", "数据日期"]""", + "question":"周倩倩近7天结算播放量总和是多少", + "prior_schema_links":"""['199509'->MPPM歌手ID]""", + "analysis": """让我们一步一步地思考。在问题“周倩倩近7天结算播放量总和是多少“中,我们被问: “结算播放量总和是多少”,所以我们需要column=[结算播放量] -”周浅“,所以我们需要column=[歌手名] -基于table和columns,可能的cell values 是 = ['周浅']。""", - "schema_links":"""["结算播放量", "歌手名", "'周浅'"]""", - "sql":"""select sum(结算播放量) from 艺人库 where 歌手名 = '周浅'""" - }, - { +”周倩倩“,所以我们需要column=[歌手名] +”近7天“,所以我们需要column=[数据日期] +基于table和columns,可能的cell values 是 = ['周倩倩', 7]。""", + "schema_links":"""["结算播放量", "歌手名", "数据日期", "'周倩倩'", 7]""", + "sql":"""select sum(结算播放量) from 艺人库 where 歌手名 = '周倩倩' and datediff('day', 数据日期, '2023-08-31') <= 7 """ + }, + { "current_date":"2023-09-14", "table_name":"内容库产品", - "fields_list":"""['部门', '模块', '用户名', '访问次数', '访问人数', '访问时长']""", + "fields_list":"""["部门", "模块", "用户名", "访问次数", "访问人数", "访问时长", "数据日期"]""", "question":"内容库访问次数大于1k的部门是哪些", + "prior_schema_links":"""[]""", "analysis": """让我们一步一步地思考。在问题“内容库访问次数大于1k的部门是哪些“中,我们被问: “部门是哪些”,所以我们需要column=[部门] ”访问次数大于1k的“,所以我们需要column=[访问次数] 基于table和columns,可能的cell values 是 = [1000]。""", "schema_links":"""["部门", "访问次数", 1000]""", - "sql":"""select 部门 from 内容库产品 where 访问次数 > 1000""" - }, - { + "sql":"""select 部门 from 内容库产品 where 访问次数 > 1000 and 数据日期 = '2023-09-14' """ + }, + { "current_date":"2023-09-18", "table_name":"歌曲库", - "fields_list":"""['歌曲ID', '歌曲MID', '歌曲名', '歌曲版本', '歌曲类型', '翻唱类型', '结算播放量', '运营播放量', '付费用户结算播放量', '历史累计结算播放量', '运营搜播量', '结算搜播量', '运营完播量', '运营推播量', '近7日复播率', '日均搜播量']""", - "question":"陈奕迅唱的所有的播放量大于20k的雇佣者有哪些", - "analysis": """让我们一步一步地思考。在问题“陈易迅唱的所有的播放量大于20k的雇佣者有哪些“中,我们被问: -“雇佣者有哪些”,所以我们需要column=[歌曲名] + "fields_list":"""["歌曲名", "MPPM歌手ID", "歌曲版本", "歌曲类型", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""", + "question":"陈亿训唱的所有的播放量大于20k的孤勇者有哪些", + "prior_schema_links":"""['199509'->MPPM歌手ID, '1527123'->MPPM歌曲ID]""", + "analysis": """让我们一步一步地思考。在问题“陈亿训唱的所有的播放量大于20k的孤勇者有哪些“中,我们被问: +“孤勇者有哪些”,所以我们需要column=[歌曲名] ”播放量大于20k的“,所以我们需要column=[结算播放量] -”陈易迅唱的“,所以我们需要column=[歌手名] -基于table和columns,可能的cell values 是 = [20000, '陈易迅']。""", - "schema_links":"""["歌曲名", "结算播放量", "歌手名", 20000, "'陈易迅'"]""", - "sql":"""select 歌曲名 from 歌曲库 where 结算播放量 > 20000 and 歌手名 = '陈易迅'""" - } +”陈亿训唱的“,所以我们需要column=[歌手名] +基于table和columns,可能的cell values 是 = [20000, '陈亿训', '孤勇者']。""", + "schema_links":"""["歌曲名", "结算播放量", "歌手名", 20000, "'陈亿训'", "'孤勇者'"]""", + "sql":"""select 歌曲名 from 歌曲库 where 结算播放量 > 20000 and 歌手名 = '陈亿训' and 歌曲名 = '孤勇者' and 数据日期 = '2023-09-18' """ + }, + { "current_date":"2023-09-18", + "table_name":"歌曲库", + "fields_list":"""["歌曲名", "歌曲版本", "歌手名", "歌曲类型", "发布时间", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""", + "question":"周洁轮去年发布的歌曲有哪些", + "prior_schema_links":"""['23109'->MPPM歌手ID]""", + "analysis": """让我们一步一步地思考。在问题“周洁轮去年发布的歌曲有哪些“中,我们被问: +“歌曲有哪些”,所以我们需要column=[歌曲名] +”去年发布的“,所以我们需要column=[发布时间] +”周洁轮“,所以我们需要column=[歌手名] +基于table和columns,可能的cell values 是 = ['周洁轮', 1]。""", + "schema_links":"""["歌曲名", "发布时间", "歌手名", 1, "'周洁轮'"]""", + "sql":"""select 歌曲名 from 歌曲库 where datediff('year', 发布时间, '2023-09-18') <= 1 and 歌手名 = '周洁轮' and 数据日期 = '2023-09-18' """ + }, + { "current_date":"2023-09-11", + "table_name":"艺人库", + "fields_list":"""["播放量层级", "播放量单调性", "播放量方差", "播放量突增类型", "播放量集中度", "歌手名", "歌手等级", "歌手类型", "歌手来源", "签约日期", "MPPM潮流人等级", "结算播放量", "运营播放量", "历史累计结算播放量", "有播放量歌曲数", "历史累计运营播放量", "付费用户结算播放量", "结算播放量占比", "运营播放份额", "免费用户结算播放占比", "完播量", "数据日期"]""", + "question":"我想要近半年签约的播放量前十的歌手有哪些", + "prior_schema_links":"""[]""", + "analysis": """让我们一步一步地思考。在问题“我想要近半年签约的播放量前十的歌手“中,我们被问: +“歌手有哪些”,所以我们需要column=[歌手名] +”播放量前十的“,所以我们需要column=[结算播放量] +”近半年签约的“,所以我们需要column=[签约日期] +基于table和columns,可能的cell values 是 = [0.5, 10]。""", + "schema_links":"""["歌手名", "结算播放量", "签约日期", 0.5, 10]""", + "sql":"""select 歌手名 from 艺人库 where datediff('year', 签约日期, '2023-09-11') <= 0.5 and 数据日期 = '2023-09-11' order by 结算播放量 desc limit 10""" + }, + { "current_date":"2023-08-12", + "table_name":"歌曲库", + "fields_list": """["发行日期", "歌曲语言", "歌曲来源", "歌曲流派", "歌曲名", "歌曲版本", "歌曲类型", "发行时间", "数据日期"]""", + "question":"最近一年发行的歌曲中,有哪些在近7天播放超过一千万的", + "prior_schema_links":"""[]""", + "analysis": """让我们一步一步地思考。在问题“最近一年发行的歌曲中,有哪些在近7天播放超过一千万的“中,我们被问: +“发行的歌曲中,有哪些”,所以我们需要column=[歌曲名] +”最近一年发行的“,所以我们需要column=[发行日期] +”在近7天播放超过一千万的“,所以我们需要column=[数据日期, 结算播放量] +基于table和columns,可能的cell values 是 = [1, 10000000]""", + "schema_links":"""["歌曲名", "发行日期", "数据日期", "结算播放量", 1, 10000000]""", + "sql":"""select 歌曲名 from 歌曲库 where datediff('year', 发行日期, '2023-08-12') <= 1 and datediff('day', 数据日期, '2023-08-12') <= 7 and 结算播放量 > 10000000""" + }, + { "current_date":"2023-08-12", + "table_name":"歌曲库", + "fields_list": """["发行日期", "歌曲语言", "歌曲来源", "歌曲流派", "歌曲名", "歌曲版本", "歌曲类型", "发行时间", "数据日期"]""", + "question":"今年以来发行的歌曲中,有哪些在近7天播放超过一千万的", + "prior_schema_links":"""[]""", + "analysis": """让我们一步一步地思考。在问题“今年以来发行的歌曲中,有哪些在近7天播放超过一千万的“中,我们被问: +“发行的歌曲中,有哪些”,所以我们需要column=[歌曲名] +”今年以来发行的“,所以我们需要column=[发行日期] +”在近7天播放超过一千万的“,所以我们需要column=[数据日期, 结算播放量] +基于table和columns,可能的cell values 是 = [0, 7, 10000000]""", + "schema_links":"""["歌曲名", "发行日期", "数据日期", "结算播放量", 0, 7, 10000000]""", + "sql":"""select 歌曲名 from 歌曲库 where datediff('year', 发行日期, '2023-08-12') <= 0 and datediff('day', 数据日期, '2023-08-12') <= 7 and 结算播放量 > 10000000""" + }, + { "current_date":"2023-08-12", + "table_name":"歌曲库", + "fields_list": """["发行日期", "歌曲语言", "歌曲来源", "歌曲流派", "歌曲名", "歌曲版本", "歌曲类型", "发行时间", "数据日期"]""", + "question":"2023年以来发行的歌曲中,有哪些在近7天播放超过一千万的", + "prior_schema_links":"""['514129144'->MPPM歌曲ID]""", + "analysis": """让我们一步一步地思考。在问题“2023年以来发行的歌曲中,有哪些在近7天播放超过一千万的“中,我们被问: +“发行的歌曲中,有哪些”,所以我们需要column=[歌曲名] +”2023年以来发行的“,所以我们需要column=[发行日期] +”在近7天播放超过一千万的“,所以我们需要column=[数据日期, 结算播放量] +基于table和columns,可能的cell values 是 = [2023, 7, 10000000]""", + "schema_links":"""["歌曲名", "发行日期", "数据日期", "结算播放量", 2023, 7, 10000000]""", + "sql":"""select 歌曲名 from 歌曲库 where YEAR(发行日期) >= 2023 and datediff('day', 数据日期, '2023-08-12') <= 7 and 结算播放量 > 10000000""" + }, + { "current_date":"2023-08-01", + "table_name":"歌曲库", + "fields_list":"""["歌曲名", "歌曲版本", "歌手名", "歌曲类型", "发布时间", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""", + "question":"周洁轮2023年6月之后发布的歌曲有哪些", + "prior_schema_links":"""['23109'->MPPM歌手ID]""", + "analysis": """让我们一步一步地思考。在问题“周洁轮2023年6月之后发布的歌曲有哪些“中,我们被问: +“歌曲有哪些”,所以我们需要column=[歌曲名] +”2023年6月之后发布的“,所以我们需要column=[发布时间] +”周洁轮“,所以我们需要column=[歌手名] +基于table和columns,可能的cell values 是 = ['周洁轮', 2023, 6]。""", + "schema_links":"""["歌曲名", "发布时间", "歌手名", "周洁轮", 2023, 6]""", + "sql":"""select 歌曲名 from 歌曲库 where YEAR(发布时间) >= 2023 and MONTH(发布时间) >= 6 and 歌手名 = '周洁轮' and 数据日期 = '2023-08-01' """ + }, + { "current_date":"2023-08-01", + "table_name":"歌曲库", + "fields_list":"""["歌曲名", "歌曲版本", "歌手名", "歌曲类型", "发布时间", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""", + "question":"邓梓琦在2023年1月5日之后发布的歌曲中,有哪些播放量大于500W的?", + "prior_schema_links":"""['2312311'->MPPM歌手ID]""", + "analysis": """让我们一步一步地思考。在问题“邓梓琦在2023年1月5日之后发布的歌曲中,有哪些播放量大于500W的?“中,我们被问: +“播放量大于500W的”,所以我们需要column=[结算播放量] +”邓梓琦在2023年1月5日之后发布的“,所以我们需要column=[发布时间] +”邓梓琦“,所以我们需要column=[歌手名] +基于table和columns,可能的cell values 是 = ['邓梓琦', 2023, 1, 5, 5000000]。""", + "schema_links":"""["结算播放量", "发布时间", "歌手名", "邓梓琦", 2023, 1, 5, 5000000]""", + "sql":"""select 歌曲名 from 歌曲库 where YEAR(发布时间) >= 2023 and MONTH(发布时间) >= 1 and DAY(发布时间) >= 5 and 歌手名 = '邓梓琦' and 结算播放量 > 5000000 and 数据日期 = '2023-08-01'""" + }, + { "current_date":"2023-09-17", + "table_name":"歌曲库", + "fields_list":"""["歌曲名", "歌曲版本", "歌手名", "歌曲类型", "发布时间", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""", + "question":"2023年6月以后,张亮英播放量大于200万的歌曲有哪些?", + "prior_schema_links":"""['45453'->MPPM歌手ID]""", + "analysis": """让我们一步一步地思考。在问题“2023年6月以后,张亮英播放量大于200万的歌曲有哪些?“中,我们被问: +“播放量大于200万的”,所以我们需要column=[结算播放量] +”2023年6月以后,张亮英“,所以我们需要column=[数据日期, 歌手名] +”歌曲有哪些“,所以我们需要column=[歌曲名] +基于table和columns,可能的cell values 是 = ['张亮英', 2023, 6, 2000000]。""", + "schema_links":"""["结算播放量", "数据日期", "歌手名", "张亮英", 2023, 6, 2000000]""", + "sql":"""select 歌曲名 from 歌曲库 where YEAR(数据日期) >= 2023 and MONTH(数据日期) >= 6 and 歌手名 = '张亮英' and 结算播放量 > 2000000 """ + }, + { "current_date":"2023-08-16", + "table_name":"歌曲库", + "fields_list":"""["歌曲名", "歌曲版本", "歌手名", "歌曲类型", "发布时间", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""", + "question":"2021年6月以后发布的李雨纯的播放量大于20万的歌曲有哪些", + "prior_schema_links":"""['23109'->MPPM歌手ID]""", + "analysis": """让我们一步一步地思考。在问题“2021年6月以后发布的李雨纯的播放量大于20万的歌曲有哪些“中,我们被问: +“播放量大于20万的”,所以我们需要column=[结算播放量] +”2021年6月以后发布的“,所以我们需要column=[发布时间] +”李雨纯“,所以我们需要column=[歌手名] +基于table和columns,可能的cell values 是 = ['李雨纯', 2021, 6, 200000]。""", + "schema_links":"""["结算播放量", "发布时间", "歌手名", "李雨纯", 2021, 6, 200000]""", + "sql":"""select 歌曲名 from 歌曲库 where YEAR(发布时间) >= 2021 and MONTH(发布时间) >= 6 and 歌手名 = '李雨纯' and 结算播放量 > 200000 and 数据日期 = '2023-08-16'""" + }, + { "current_date":"2023-08-16", + "table_name":"歌曲库", + "fields_list":"""["歌曲名", "歌曲版本", "歌手名", "歌曲类型", "发布时间", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""", + "question":"刘锝桦在1992年4月2日到2020年5月2日之间发布的播放量大于20万的歌曲有哪些", + "prior_schema_links":"""['4234234'->MPPM歌手ID]""", + "analysis": """让我们一步一步地思考。在问题“刘锝桦在1992年4月2日到2020年5月2日之间发布的播放量大于20万的歌曲有哪些“中,我们被问: +“播放量大于20万的”,所以我们需要column=[结算播放量] +”1992年4月2日到2020年5月2日之间发布的“,所以我们需要column=[发布时间] +”刘锝桦“,所以我们需要column=[歌手名] +基于table和columns,可能的cell values 是 = ['刘锝桦', 1992, 4, 2, 2020, 5, 2, 200000]。""", + "schema_links":"""["结算播放量", "发布时间", "歌手名", "刘锝桦", 1992, 4, 2, 2020, 5, 2, 200000]""", + "sql":"""select 歌曲名 from 歌曲库 where YEAR(发布时间) >= 1992 and MONTH(发布时间) >= 4 and DAY(发布时间) >= 2 and YEAR(发布时间) <= 2020 and MONTH(发布时间) <= 5 and DAY(发布时间) <= 2 and 歌手名 = '刘锝桦' and 结算播放量 > 200000 and 数据日期 = '2023-08-16'""" + } ] \ No newline at end of file diff --git a/chat/core/src/main/python/llm/preset_retrieval/run.py b/chat/core/src/main/python/llm/preset_retrieval/run.py index 9027253bf..dc501b49c 100644 --- a/chat/core/src/main/python/llm/preset_retrieval/run.py +++ b/chat/core/src/main/python/llm/preset_retrieval/run.py @@ -8,8 +8,7 @@ from typing import Any, List, Mapping, Optional, Union sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) sys.path.append(os.path.dirname(os.path.abspath(__file__))) -import chromadb -from chromadb.config import Settings + from chromadb.api import Collection, Documents, Embeddings from langchain.llms import OpenAI @@ -21,13 +20,9 @@ from preset_query_db import (get_ids, add2preset_query_collection, from util.text2vec import Text2VecEmbeddingFunction from run_config import CHROMA_DB_PERSIST_PATH, PRESET_QUERY_COLLECTION_NAME +from util.chromadb_instance import client -client = chromadb.Client(Settings( - chroma_db_impl="duckdb+parquet", - persist_directory=CHROMA_DB_PERSIST_PATH # Optional, defaults to .chromadb/ in the current directory -)) - emb_func = Text2VecEmbeddingFunction() collection = client.get_or_create_collection(name=PRESET_QUERY_COLLECTION_NAME, @@ -35,6 +30,8 @@ collection = client.get_or_create_collection(name=PRESET_QUERY_COLLECTION_NAME, metadata={"hnsw:space": "cosine"} ) # Get a collection object from an existing collection, by name. If it doesn't exist, create it. +print("init_preset_query_collection_size: ", preset_query_collection_size(collection)) + def preset_query_retrieval_run(collection:Collection, query_texts_list:List[str], n_results:int=5): retrieval_res = query2preset_query_collection(collection=collection, diff --git a/chat/core/src/main/python/llm/run_config.py b/chat/core/src/main/python/llm/run_config.py index 989b44e5a..e2b47b404 100644 --- a/chat/core/src/main/python/llm/run_config.py +++ b/chat/core/src/main/python/llm/run_config.py @@ -9,6 +9,7 @@ TEMPERATURE = 0.0 CHROMA_DB_PERSIST_DIR = 'chm_db' PRESET_QUERY_COLLECTION_NAME = "preset_query_collection" +TEXT2DSL_COLLECTION_NAME = "text2dsl_collection" CHROMA_DB_PERSIST_PATH = os.path.join(PROJECT_DIR_PATH, CHROMA_DB_PERSIST_DIR) diff --git a/chat/core/src/main/python/llm/sql/constructor.py b/chat/core/src/main/python/llm/sql/constructor.py new file mode 100644 index 000000000..c6f367492 --- /dev/null +++ b/chat/core/src/main/python/llm/sql/constructor.py @@ -0,0 +1,53 @@ +# -*- coding:utf-8 -*- +import os +import sys + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +sys.path.append(os.path.dirname(os.path.abspath(__file__))) + +from langchain.prompts.few_shot import FewShotPromptTemplate +from langchain.prompts import PromptTemplate +from langchain.vectorstores import Chroma +from langchain.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings +from langchain.prompts.example_selector import SemanticSimilarityExampleSelector + +import chromadb +from chromadb.config import Settings + +from few_shot_example.sql_exampler import examplars as din_sql_examplars +from util.text2vec import Text2VecEmbeddingFunction, hg_embedding +from util.chromadb_instance import client as chromadb_client + + +from run_config import TEXT2DSL_COLLECTION_NAME + + +vectorstore = Chroma(collection_name=TEXT2DSL_COLLECTION_NAME, + embedding_function=hg_embedding, + client=chromadb_client) + +example_nums = 15 + +schema_linking_example_selector = SemanticSimilarityExampleSelector(vectorstore=vectorstore, k=example_nums, + input_keys=["question"], + example_keys=["table_name", "fields_list", "prior_schema_links", "question", "analysis", "schema_links"]) + +sql_example_selector = SemanticSimilarityExampleSelector(vectorstore=vectorstore, k=example_nums, + input_keys=["question"], + example_keys=["question", "current_date", "table_name", "schema_links", "sql"]) + +if vectorstore._collection.count() > 0: + print("examples already in din_sql_vectorstore") + print("init din_sql_vectorstore size:", vectorstore._collection.count()) + if vectorstore._collection.count() < len(din_sql_examplars): + print("din_sql_examplars size:", len(din_sql_examplars)) + vectorstore._collection.delete() + print("empty din_sql_vectorstore") + for example in din_sql_examplars: + schema_linking_example_selector.add_example(example) + print("added din_sql_vectorstore size:", vectorstore._collection.count()) +else: + for example in din_sql_examplars: + schema_linking_example_selector.add_example(example) + + print("added din_sql_vectorstore size:", vectorstore._collection.count()) diff --git a/chat/core/src/main/python/llm/sql/output_parser.py b/chat/core/src/main/python/llm/sql/output_parser.py index 64df5ba1f..c90388850 100644 --- a/chat/core/src/main/python/llm/sql/output_parser.py +++ b/chat/core/src/main/python/llm/sql/output_parser.py @@ -1,15 +1,13 @@ # -*- coding:utf-8 -*- import re - def schema_link_parse(schema_link_output): - try: - schema_link_output = schema_link_output.strip() - pattern = r'Schema_links:(.*)' - schema_link_output = re.findall(pattern, schema_link_output, re.DOTALL)[ - 0].strip() - except Exception as e: - print(e) - schema_link_output = None + try: + schema_link_output = schema_link_output.strip() + pattern = r'Schema_links:(.*)' + schema_link_output = re.findall(pattern, schema_link_output, re.DOTALL)[0].strip() + except Exception as e: + print(e) + schema_link_output = None - return schema_link_output + return schema_link_output \ No newline at end of file diff --git a/chat/core/src/main/python/llm/sql/prompt_maker.py b/chat/core/src/main/python/llm/sql/prompt_maker.py index 6e05f95b3..0cfed83b1 100644 --- a/chat/core/src/main/python/llm/sql/prompt_maker.py +++ b/chat/core/src/main/python/llm/sql/prompt_maker.py @@ -1,8 +1,5 @@ # -*- coding:utf-8 -*- from typing import Any, List, Mapping, Optional, Union -import requests -import logging -import json import os import sys @@ -11,78 +8,68 @@ sys.path.append(os.path.dirname(os.path.abspath(__file__))) from langchain.prompts import PromptTemplate from langchain.prompts.few_shot import FewShotPromptTemplate -from langchain.llms import OpenAI - -from few_shot_example.sql_exampler import examplars -from output_parser import schema_link_parse - - -def schema_linking_prompt_maker(user_query: str, model_name: str, - fields_list: List[str], - few_shots_example: str): - instruction = "# 根据数据库的表结构,找出为每个问题生成SQL查询语句的schema_links\n" - - schema_linking_prompt = "Table {table_name}, columns = {fields_list}\n问题:{user_query}\n分析: 让我们一步一步地思考。".format( - table_name=model_name, - fields_list=fields_list, - user_query=user_query) - - return instruction + few_shots_example + schema_linking_prompt +from langchain.prompts.example_selector import SemanticSimilarityExampleSelector def schema_linking_exampler(user_query: str, - model_name: str, - fields_list: List[str] -) -> str: - example_prompt_template = PromptTemplate( - input_variables=["table_name", "fields_list", "question", "analysis", - "schema_links"], - template="Table {table_name}, columns = {fields_list}\n问题:{question}\n分析:{analysis} 所以Schema_links是:\nSchema_links:{schema_links}") + domain_name: str, + fields_list: List[str], + prior_schema_links: Mapping[str,str], + example_selector: SemanticSimilarityExampleSelector, + ) -> str: - instruction = "# 根据数据库的表结构,找出为每个问题生成SQL查询语句的schema_links" + prior_schema_links_str = '['+ ','.join(["""'{}'->{}""".format(k,v) for k,v in prior_schema_links.items()]) + ']' - schema_linking_prompt = "Table {table_name}, columns = {fields_list}\n问题:{question}\n分析: 让我们一步一步地思考。" + example_prompt_template = PromptTemplate(input_variables=["table_name", "fields_list", "prior_schema_links", "question", "analysis", "schema_links"], + template="Table {table_name}, columns = {fields_list}, prior_schema_links = {prior_schema_links}\n问题:{question}\n分析:{analysis} 所以Schema_links是:\nSchema_links:{schema_links}") - schema_linking_example_prompt_template = FewShotPromptTemplate( - examples=examplars, - example_prompt=example_prompt_template, - example_separator="\n\n", - prefix=instruction, - input_variables=["table_name", "fields_list", "question"], - suffix=schema_linking_prompt - ) + instruction = "# 根据数据库的表结构,参考先验信息,找出为每个问题生成SQL查询语句的schema_links" - schema_linking_example_prompt = schema_linking_example_prompt_template.format( - table_name=model_name, - fields_list=fields_list, - question=user_query) + schema_linking_prompt = "Table {table_name}, columns = {fields_list}, prior_schema_links = {prior_schema_links}\n问题:{question}\n分析: 让我们一步一步地思考。" - return schema_linking_example_prompt + schema_linking_example_prompt_template = FewShotPromptTemplate( + example_selector=example_selector, + example_prompt=example_prompt_template, + example_separator="\n\n", + prefix=instruction, + input_variables=["table_name", "fields_list", "prior_schema_links", "question"], + suffix=schema_linking_prompt + ) + + schema_linking_example_prompt = schema_linking_example_prompt_template.format(table_name=domain_name, + fields_list=fields_list, + prior_schema_links=prior_schema_links_str, + question=user_query) + + return schema_linking_example_prompt def sql_exampler(user_query: str, - model_name: str, - schema_link_str: str -) -> str: - instruction = "# 根据schema_links为每个问题生成SQL查询语句" + domain_name: str, + schema_link_str: str, + data_date: str, + example_selector: SemanticSimilarityExampleSelector, + ) -> str: + + instruction = "# 根据schema_links为每个问题生成SQL查询语句" - sql_example_prompt_template = PromptTemplate( - input_variables=["question", "table_name", "schema_links", "sql"], - template="问题:{question}\nTable {table_name}\nSchema_links:{schema_links}\nSQL:{sql}") + sql_example_prompt_template = PromptTemplate(input_variables=["question", "current_date", "table_name", "schema_links", "sql"], + template="问题:{question}\nCurrent_date:{current_date}\nTable {table_name}\nSchema_links:{schema_links}\nSQL:{sql}") - sql_prompt = "问题:{question}\nTable {table_name}\nSchema_links:{schema_links}\nSQL:" + sql_prompt = "问题:{question}\nCurrent_date:{current_date}\nTable {table_name}\nSchema_links:{schema_links}\nSQL:" - sql_example_prompt_template = FewShotPromptTemplate( - examples=examplars, - example_prompt=sql_example_prompt_template, - example_separator="\n\n", - prefix=instruction, - input_variables=["question", "table_name", "schema_links"], - suffix=sql_prompt - ) + sql_example_prompt_template = FewShotPromptTemplate( + example_selector=example_selector, + example_prompt=sql_example_prompt_template, + example_separator="\n\n", + prefix=instruction, + input_variables=["question", "current_date", "table_name", "schema_links"], + suffix=sql_prompt + ) - sql_example_prompt = sql_example_prompt_template.format(question=user_query, - table_name=model_name, - schema_links=schema_link_str) + sql_example_prompt = sql_example_prompt_template.format(question=user_query, + current_date=data_date, + table_name=domain_name, + schema_links=schema_link_str) - return sql_example_prompt + return sql_example_prompt diff --git a/chat/core/src/main/python/llm/sql/run.py b/chat/core/src/main/python/llm/sql/run.py index ea60d7f36..34919799b 100644 --- a/chat/core/src/main/python/llm/sql/run.py +++ b/chat/core/src/main/python/llm/sql/run.py @@ -1,6 +1,4 @@ -# -*- coding:utf-8 -*- - -from typing import List, Union +from typing import List, Union, Mapping import logging import json import os @@ -9,33 +7,54 @@ import sys sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) sys.path.append(os.path.dirname(os.path.abspath(__file__))) -from sql.prompt_maker import schema_linking_exampler, schema_link_parse, \ - sql_exampler +from sql.prompt_maker import schema_linking_exampler, sql_exampler +from sql.constructor import schema_linking_example_selector, sql_example_selector +from sql.output_parser import schema_link_parse from util.llm_instance import llm -def query2sql(query_text: str, schema: dict): - print("schema: ", schema) - model_name = schema['modelName'] - fields_list = schema['fieldNameList'] +def query2sql(query_text: str, + schema : Union[dict, None] = None, + current_date: str = None, + linking: Union[List[Mapping[str, str]], None] = None + ): + + print("query_text: ", query_text) + print("schema: ", schema) + print("current_date: ", current_date) + print("prior_schema_links: ", linking) - schema_linking_prompt = schema_linking_exampler(query_text, model_name, - fields_list) - schema_link_output = llm(schema_linking_prompt) - schema_link_str = schema_link_parse(schema_link_output) + if linking is not None: + prior_schema_links = {item['fieldValue']:item['fieldName'] for item in linking} + else: + prior_schema_links = {} - sql_prompt = sql_exampler(query_text, model_name, schema_link_str) - sql_output = llm(sql_prompt) + model_name = schema['modelName'] + fields_list = schema['fieldNameList'] - resp = dict() - resp['query'] = query_text - resp['model'] = model_name - resp['fields'] = fields_list + schema_linking_prompt = schema_linking_exampler(query_text, model_name, fields_list, prior_schema_links, schema_linking_example_selector) + print("schema_linking_prompt->", schema_linking_prompt) + schema_link_output = llm(schema_linking_prompt) + schema_link_str = schema_link_parse(schema_link_output) + + sql_prompt = sql_exampler(query_text, model_name, schema_link_str, current_date, sql_example_selector) + print("sql_prompt->", sql_prompt) + sql_output = llm(sql_prompt) - resp['schemaLinkingOutput'] = schema_link_output - resp['schemaLinkStr'] = schema_link_str + resp = dict() + resp['query'] = query_text + resp['model'] = model_name + resp['fields'] = fields_list + resp['priorSchemaLinking'] = linking + resp['dataDate'] = current_date - resp['sqlOutput'] = sql_output + resp['schemaLinkingOutput'] = schema_link_output + resp['schemaLinkStr'] = schema_link_str + + resp['sqlOutput'] = sql_output + + print("resp: ", resp) + + return resp - return resp diff --git a/chat/core/src/main/python/llm/util/chromadb_instance.py b/chat/core/src/main/python/llm/util/chromadb_instance.py new file mode 100644 index 000000000..f0fe6ce01 --- /dev/null +++ b/chat/core/src/main/python/llm/util/chromadb_instance.py @@ -0,0 +1,10 @@ +# -*- coding:utf-8 -*- +import chromadb +from chromadb.config import Settings + +from run_config import CHROMA_DB_PERSIST_PATH + +client = chromadb.Client(Settings( + chroma_db_impl="duckdb+parquet", + persist_directory=CHROMA_DB_PERSIST_PATH # Optional, defaults to .chromadb/ in the current directory +)) \ No newline at end of file diff --git a/launchers/standalone/src/main/java/com/tencent/supersonic/ConfigureDemo.java b/launchers/standalone/src/main/java/com/tencent/supersonic/ConfigureDemo.java index a12be17e9..544e02ee0 100644 --- a/launchers/standalone/src/main/java/com/tencent/supersonic/ConfigureDemo.java +++ b/launchers/standalone/src/main/java/com/tencent/supersonic/ConfigureDemo.java @@ -220,6 +220,7 @@ public class ConfigureDemo implements ApplicationListener AgentConfig agentConfig = new AgentConfig(); RuleQueryTool ruleQueryTool = new RuleQueryTool(); ruleQueryTool.setType(AgentToolType.RULE); + ruleQueryTool.setId("0"); ruleQueryTool.setModelIds(Lists.newArrayList(-1L)); ruleQueryTool.setQueryModes(Lists.newArrayList( "METRIC_ENTITY", "METRIC_FILTER", "METRIC_GROUPBY", @@ -228,6 +229,7 @@ public class ConfigureDemo implements ApplicationListener agentConfig.getTools().add(ruleQueryTool); DslTool dslTool = new DslTool(); + dslTool.setId("1"); dslTool.setType(AgentToolType.DSL); dslTool.setModelIds(Lists.newArrayList(-1L)); agentConfig.getTools().add(dslTool); @@ -246,6 +248,7 @@ public class ConfigureDemo implements ApplicationListener agent.setExamples(Lists.newArrayList("国风风格艺人", "港台地区的艺人", "风格为流行的艺人")); AgentConfig agentConfig = new AgentConfig(); RuleQueryTool ruleQueryTool = new RuleQueryTool(); + ruleQueryTool.setId("0"); ruleQueryTool.setType(AgentToolType.RULE); ruleQueryTool.setModelIds(Lists.newArrayList(-1L)); ruleQueryTool.setQueryModes(Lists.newArrayList( @@ -253,6 +256,7 @@ public class ConfigureDemo implements ApplicationListener agentConfig.getTools().add(ruleQueryTool); DslTool dslTool = new DslTool(); + dslTool.setId("1"); dslTool.setType(AgentToolType.DSL); dslTool.setModelIds(Lists.newArrayList(-1L)); agentConfig.getTools().add(dslTool);