From 41aa6ff8e40882cf4ca114b4a375c44ccc2276ac Mon Sep 17 00:00:00 2001 From: codescracker Date: Thu, 9 Nov 2023 11:46:56 +0800 Subject: [PATCH] add api service for sql_agent for crud opereations of few-shots examples. (#342) --- .../{sql_exampler.py => sql_examplar.py} | 313 +++++++++--------- .../main/python/services/sql/constructor.py | 44 ++- .../services/sql/examples_reload_run.py | 6 +- chat/core/src/main/python/services/sql/run.py | 10 +- .../src/main/python/services/sql/sql_agent.py | 113 ++++--- .../services_router/query2sql_service.py | 77 ++++- 6 files changed, 342 insertions(+), 221 deletions(-) rename chat/core/src/main/python/few_shot_example/{sql_exampler.py => sql_examplar.py} (57%) diff --git a/chat/core/src/main/python/few_shot_example/sql_exampler.py b/chat/core/src/main/python/few_shot_example/sql_examplar.py similarity index 57% rename from chat/core/src/main/python/few_shot_example/sql_exampler.py rename to chat/core/src/main/python/few_shot_example/sql_examplar.py index 1c2fd669f..8d5f3a886 100644 --- a/chat/core/src/main/python/few_shot_example/sql_exampler.py +++ b/chat/core/src/main/python/few_shot_example/sql_examplar.py @@ -1,361 +1,374 @@ examplars= [ - { "current_date":"2020-12-01", - "table_name":"内容库产品", - "fields_list":"""["部门", "模块", "用户名", "访问次数", "访问人数", "访问时长", "数据日期"]""", + { "currentDate":"2020-12-01", + "tableName":"内容库产品", + "fieldsList":"""["部门", "模块", "用户名", "访问次数", "访问人数", "访问时长", "数据日期"]""", "question":"比较jackjchen和robinlee在内容库的访问次数", - "prior_schema_links":"""['jackjchen'->用户名, 'robinlee'->用户名]""", + "priorSchemaLinks":"""['jackjchen'->用户名, 'robinlee'->用户名]""", "analysis": """让我们一步一步地思考。在问题“比较jackjchen和robinlee在内容库的访问次数“中,我们被问: “比较jackjchen和robinlee”,所以我们需要column=[用户名],cell values = ['jackjchen', 'robinlee'],所以有[用户名:('jackjchen', 'robinlee')] ”内容库的访问次数“,所以我们需要column=[访问次数]""", - "schema_links":"""["用户名":("'jackjchen'", "'robinlee'"), "访问次数"]""", + "schemaLinks":"""["用户名":("'jackjchen'", "'robinlee'"), "访问次数"]""", "sql":"""select 用户名, 访问次数 from 内容库产品 where 用户名 in ('jackjchen', 'robinlee')""" }, - { "current_date":"2022-11-06", - "table_name":"内容库产品", - "fields_list":"""["部门", "模块", "用户名", "访问次数", "访问人数", "访问时长", "数据日期"]""", + { "currentDate":"2022-11-06", + "tableName":"内容库产品", + "fieldsList":"""["部门", "模块", "用户名", "访问次数", "访问人数", "访问时长", "数据日期"]""", "question":"内容库近12个月访问人数 按部门", - "prior_schema_links":"""[]""", + "priorSchemaLinks":"""[]""", "analysis": """让我们一步一步地思考。在问题“内容库近12个月访问人数 按部门“中,我们被问: ”内容库近12个月“,所以我们需要column=[数据日期],cell values = [12],所以有[数据日期:(12)] “访问人数”,所以我们需要column=[访问人数] ”按部门“,所以我们需要column=[部门]""", - "schema_links":"""["数据日期":(12), "访问人数", "部门"]""", + "schemaLinks":"""["数据日期":(12), "访问人数", "部门"]""", "sql":"""select 部门, 数据日期, 访问人数 from 内容库产品 where datediff('month', 数据日期, '2022-11-06') <= 12 """ }, - { "current_date":"2023-04-21", - "table_name":"内容库产品", - "fields_list":"""["部门", "模块", "用户名", "访问次数", "访问人数", "访问时长", "数据日期"]""", + { "currentDate":"2023-04-21", + "tableName":"内容库产品", + "fieldsList":"""["部门", "模块", "用户名", "访问次数", "访问人数", "访问时长", "数据日期"]""", "question":"内容库美术部、技术研发部的访问时长", - "prior_schema_links":"""['美术部'->部门, '技术研发部'->部门]""", + "priorSchemaLinks":"""['美术部'->部门, '技术研发部'->部门]""", "analysis": """让我们一步一步地思考。在问题“内容库美术部、技术研发部的访问时长“中,我们被问: “访问时长”,所以我们需要column=[访问时长] ”内容库美术部、技术研发部“,所以我们需要column=[部门], cell values = ['美术部', '技术研发部'],所以有[部门:('美术部', '技术研发部')]""", - "schema_links":"""["访问时长", "部门":("'美术部'", "'技术研发部'")]""", + "schemaLinks":"""["访问时长", "部门":("'美术部'", "'技术研发部'")]""", "sql":"""select 部门, 访问时长 from 内容库产品 where 部门 in ('美术部', '技术研发部')""" }, - { "current_date":"2023-08-21", - "table_name":"严选", - "fields_list":"""["严选版权归属系", "付费模式", "结算播放份额", "付费用户结算播放份额", "数据日期"]""", + { "currentDate":"2023-08-21", + "tableName":"严选", + "fieldsList":"""["严选版权归属系", "付费模式", "结算播放份额", "付费用户结算播放份额", "数据日期"]""", "question":"近3天海田飞系MPPM结算播放份额", - "prior_schema_links":"""['海田飞系'->严选版权归属系]""", + "priorSchemaLinks":"""['海田飞系'->严选版权归属系]""", "analysis": """让我们一步一步地思考。在问题“近3天海田飞系MPPM结算播放份额“中,我们被问: “MPPM结算播放份额”,所以我们需要column=[结算播放份额], ”海田飞系“,所以我们需要column=[严选版权归属系], cell values = ['海田飞系'],所以有[严选版权归属系:('海田飞系')], ”近3天“,所以我们需要column=[数据日期], cell values = [3],所以有[数据日期:(3)]""", - "schema_links":"""["结算播放份额", "严选版权归属系":("'海田飞系'"), "数据日期":(3)]""", + "schemaLinks":"""["结算播放份额", "严选版权归属系":("'海田飞系'"), "数据日期":(3)]""", "sql":"""select 严选版权归属系, 结算播放份额 from 严选 where 严选版权归属系 = '海田飞系' and datediff('day', 数据日期, '2023-08-21') <= 3 """ }, - { "current_date":"2023-05-22", - "table_name":"歌曲库", - "fields_list":"""["是否潮流人歌曲", "C音歌曲ID", "C音歌曲MID", "歌曲名", "歌曲版本", "语种", "歌曲类型", "翻唱类型", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "结算播放量", "运营播放量", "付费用户结算播放量", "历史累计结算播放量", "运营搜播量", "结算搜播量", "运营完播量", "运营推播量", "近7日复播率", "日均搜播量", "数据日期"]""", + { "currentDate":"2023-05-22", + "tableName":"歌曲库", + "fieldsList":"""["是否潮流人歌曲", "C音歌曲ID", "C音歌曲MID", "歌曲名", "歌曲版本", "语种", "歌曲类型", "翻唱类型", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "结算播放量", "运营播放量", "付费用户结算播放量", "历史累计结算播放量", "运营搜播量", "结算搜播量", "运营完播量", "运营推播量", "近7日复播率", "日均搜播量", "数据日期"]""", "question":"对比近7天翻唱版和纯音乐的歌曲播放量", - "prior_schema_links":"""['纯音乐'->语种, '翻唱版'->歌曲版本]""", + "priorSchemaLinks":"""['纯音乐'->语种, '翻唱版'->歌曲版本]""", "analysis": """让我们一步一步地思考。在问题“对比近3天翻唱版和纯音乐的歌曲播放量“中,我们被问: “歌曲播放量”,所以我们需要column=[结算播放量] ”翻唱版“,所以我们需要column=[歌曲版本], cell values = ['翻唱版'],所以有[歌曲版本:('翻唱版')] ”和纯音乐的歌曲“,所以我们需要column=[语种], cell values = ['纯音乐'],所以有[语种:('纯音乐')] ”近7天“,所以我们需要column=[数据日期], cell values = [7],所以有[数据日期:(7)]""", - "schema_links":"""["结算播放量", "歌曲版本":("'翻唱版'"), "语种":("'纯音乐'"), "数据日期":(7)]""", + "schemaLinks":"""["结算播放量", "歌曲版本":("'翻唱版'"), "语种":("'纯音乐'"), "数据日期":(7)]""", "sql":"""select 歌曲版本, 语种, 结算播放量 from 歌曲库 where 歌曲版本 = '翻唱版' and 语种 = '纯音乐' and datediff('day', 数据日期, '2023-05-22') <= 7 """ }, - { "current_date":"2023-05-31", - "table_name":"艺人库", - "fields_list":"""["上下架状态", "歌手名", "歌手等级", "歌手类型", "歌手来源", "MPPM潮流人等级", "活跃区域", "年龄", "歌手才能", "歌手风格", "粉丝数", "潮音粉丝数", "超声波粉丝数", "推博粉丝数", "超声波歌曲数", "在架歌曲数", "超声波分享数", "独占歌曲数", "超声波在架歌曲评论数", "有播放量歌曲数", "数据日期"]""", + { "currentDate":"2023-05-31", + "tableName":"艺人库", + "fieldsList":"""["上下架状态", "歌手名", "歌手等级", "歌手类型", "歌手来源", "MPPM潮流人等级", "活跃区域", "年龄", "歌手才能", "歌手风格", "粉丝数", "潮音粉丝数", "超声波粉丝数", "推博粉丝数", "超声波歌曲数", "在架歌曲数", "超声波分享数", "独占歌曲数", "超声波在架歌曲评论数", "有播放量歌曲数", "数据日期"]""", "question":"对比一下陈拙悬、孟梅琦、赖媚韵的粉丝数", - "prior_schema_links":"""['1527896'->MPPM歌手ID, '1565463'->MPPM歌手ID, '2141459'->MPPM歌手ID]""", + "priorSchemaLinks":"""['1527896'->MPPM歌手ID, '1565463'->MPPM歌手ID, '2141459'->MPPM歌手ID]""", "analysis": """让我们一步一步地思考。在问题“对比一下陈拙悬、孟梅琦、赖媚韵的粉丝数“中,我们被问: “粉丝数”,所以我们需要column=[粉丝数] ”陈拙悬、孟梅琦、赖媚韵“,所以我们需要column=[歌手名], cell values = ['陈拙悬', '孟梅琦', '赖媚韵'],所以有[歌手名:('陈拙悬', '孟梅琦', '赖媚韵')]""", - "schema_links":"""["粉丝数", "歌手名":("'陈拙悬'", "'孟梅琦'", "'赖媚韵'")]""", + "schemaLinks":"""["粉丝数", "歌手名":("'陈拙悬'", "'孟梅琦'", "'赖媚韵'")]""", "sql":"""select 歌手名, 粉丝数 from 艺人库 where 歌手名 in ('陈拙悬', '孟梅琦', '赖媚韵')""" }, - { "current_date":"2023-07-31", - "table_name":"歌曲库", - "fields_list":"""["歌曲名", "歌曲版本", "歌曲类型", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""", + { "currentDate":"2023-07-31", + "tableName":"歌曲库", + "fieldsList":"""["歌曲名", "歌曲版本", "歌曲类型", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""", "question":"播放量大于1万的歌曲有多少", - "prior_schema_links":"""[]""", + "priorSchemaLinks":"""[]""", "analysis": """让我们一步一步地思考。在问题“播放量大于1万的歌曲有多少“中,我们被问: “歌曲有多少”,所以我们需要column=[歌曲名] ”播放量大于1万的“,所以我们需要column=[结算播放量], cell values = [10000],所以有[结算播放量:(10000)]""", - "schema_links":"""["歌曲名", "结算播放量":(10000)]""", + "schemaLinks":"""["歌曲名", "结算播放量":(10000)]""", "sql":"""select 歌曲名 from 歌曲库 where 结算播放量 > 10000""" }, - { "current_date":"2023-07-31", - "table_name":"内容库产品", - "fields_list":"""["用户名", "部门", "模块", "访问时长", "访问次数", "访问人数", "数据日期"]""", + { "currentDate":"2023-07-31", + "tableName":"内容库产品", + "fieldsList":"""["用户名", "部门", "模块", "访问时长", "访问次数", "访问人数", "数据日期"]""", "question":"内容库访问时长小于1小时,且来自美术部的用户是哪些", - "prior_schema_links":"""['美术部'->部门]""", + "priorSchemaLinks":"""['美术部'->部门]""", "analysis": """让我们一步一步地思考。在问题“内容库访问时长小于1小时,且来自美术部的用户是哪些“中,我们被问: “用户是哪些”,所以我们需要column=[用户名] ”美术部的“,所以我们需要column=[部门], cell values = ['美术部'],所以有[部门:('美术部')] ”访问时长小于1小时“,所以我们需要column=[访问时长], cell values = [1],所以有[访问时长:(1)]""", - "schema_links":"""["用户名", "部门":("'美术部'"), "访问时长":(1)]""", + "schemaLinks":"""["用户名", "部门":("'美术部'"), "访问时长":(1)]""", "sql":"""select 用户名 from 内容库产品 where 部门 = '美术部' and 访问时长 < 1""" }, - { "current_date":"2023-08-31", - "table_name":"内容库产品", - "fields_list":"""["用户名", "部门", "模块", "访问时长", "访问次数", "访问人数", "数据日期"]""", + { "currentDate":"2023-08-31", + "tableName":"内容库产品", + "fieldsList":"""["用户名", "部门", "模块", "访问时长", "访问次数", "访问人数", "数据日期"]""", "question":"内容库pv最高的用户有哪些", - "prior_schema_links":"""[]""", + "priorSchemaLinks":"""[]""", "analysis": """让我们一步一步地思考。在问题“内容库pv最高的用户有哪些“中,我们被问: “用户有哪些”,所以我们需要column=[用户名] ”pv最高的“,所以我们需要column=[访问次数], cell values = [1],所以有[访问次数:(1)]""", - "schema_links":"""["用户名", "访问次数":(1)]""", + "schemaLinks":"""["用户名", "访问次数":(1)]""", "sql":"""select 用户名 from 内容库产品 order by 访问次数 desc limit 1""" }, - { "current_date":"2023-08-31", - "table_name":"艺人库", - "fields_list":"""["播放量层级", "播放量单调性", "播放量方差", "播放量突增类型", "播放量集中度", "歌手名", "歌手等级", "歌手类型", "歌手来源", "MPPM潮流人等级", "结算播放量", "运营播放量", "历史累计结算播放量", "有播放量歌曲数", "历史累计运营播放量", "付费用户结算播放量", "结算播放量占比", "运营播放份额", "免费用户结算播放占比", "完播量", "数据日期"]""", + { "currentDate":"2023-08-31", + "tableName":"艺人库", + "fieldsList":"""["播放量层级", "播放量单调性", "播放量方差", "播放量突增类型", "播放量集中度", "歌手名", "歌手等级", "歌手类型", "歌手来源", "MPPM潮流人等级", "结算播放量", "运营播放量", "历史累计结算播放量", "有播放量歌曲数", "历史累计运营播放量", "付费用户结算播放量", "结算播放量占比", "运营播放份额", "免费用户结算播放占比", "完播量", "数据日期"]""", "question":"近90天袁亚伟播放量平均值是多少", - "prior_schema_links":"""['152789226'->MPPM歌手ID]""", + "priorSchemaLinks":"""['152789226'->MPPM歌手ID]""", "analysis": """让我们一步一步地思考。在问题“近90天袁亚伟播放量平均值是多少“中,我们被问: “播放量平均值是多少”,所以我们需要column=[结算播放量] ”袁亚伟“,所以我们需要column=[歌手名], cell values = ['袁亚伟'],所以有[歌手名:('袁亚伟')] ”近90天“,所以我们需要column=[数据日期], cell values = [90],所以有[数据日期:(90)]""", - "schema_links":"""["结算播放量", "歌手名":("'袁亚伟'"), "数据日期":(90)]""", + "schemaLinks":"""["结算播放量", "歌手名":("'袁亚伟'"), "数据日期":(90)]""", "sql":"""select avg(结算播放量) from 艺人库 where 歌手名 = '袁亚伟' and datediff('day', 数据日期, '2023-08-31') <= 90 """ }, - { "current_date":"2023-08-31", - "table_name":"艺人库", - "fields_list":"""["播放量层级", "播放量单调性", "播放量方差", "播放量突增类型", "播放量集中度", "歌手名", "歌手等级", "歌手类型", "歌手来源", "MPPM潮流人等级", "结算播放量", "运营播放量", "历史累计结算播放量", "有播放量歌曲数", "历史累计运营播放量", "付费用户结算播放量", "结算播放量占比", "运营播放份额", "免费用户结算播放占比", "完播量", "数据日期"]""", + { "currentDate":"2023-08-31", + "tableName":"艺人库", + "fieldsList":"""["播放量层级", "播放量单调性", "播放量方差", "播放量突增类型", "播放量集中度", "歌手名", "歌手等级", "歌手类型", "歌手来源", "MPPM潮流人等级", "结算播放量", "运营播放量", "历史累计结算播放量", "有播放量歌曲数", "历史累计运营播放量", "付费用户结算播放量", "结算播放量占比", "运营播放份额", "免费用户结算播放占比", "完播量", "数据日期"]""", "question":"周倩倩近7天结算播放量总和是多少", - "prior_schema_links":"""['199509'->MPPM歌手ID]""", + "priorSchemaLinks":"""['199509'->MPPM歌手ID]""", "analysis": """让我们一步一步地思考。在问题“周倩倩近7天结算播放量总和是多少“中,我们被问: “结算播放量总和是多少”,所以我们需要column=[结算播放量] ”周倩倩“,所以我们需要column=[歌手名], cell values = ['周倩倩'],所以有[歌手名:('周倩倩')] ”近7天“,所以我们需要column=[数据日期], cell values = [7],所以有[数据日期:(7)]""", - "schema_links":"""["结算播放量", "歌手名":("'周倩倩'"), "数据日期":(7)]""", + "schemaLinks":"""["结算播放量", "歌手名":("'周倩倩'"), "数据日期":(7)]""", "sql":"""select sum(结算播放量) from 艺人库 where 歌手名 = '周倩倩' and datediff('day', 数据日期, '2023-08-31') <= 7 """ }, - { "current_date":"2023-09-14", - "table_name":"内容库产品", - "fields_list":"""["部门", "模块", "用户名", "访问次数", "访问人数", "访问时长", "数据日期"]""", + { "currentDate":"2023-09-14", + "tableName":"内容库产品", + "fieldsList":"""["部门", "模块", "用户名", "访问次数", "访问人数", "访问时长", "数据日期"]""", "question":"内容库访问次数大于1k的部门是哪些", - "prior_schema_links":"""[]""", + "priorSchemaLinks":"""[]""", "analysis": """让我们一步一步地思考。在问题“内容库访问次数大于1k的部门是哪些“中,我们被问: “部门是哪些”,所以我们需要column=[部门] ”访问次数大于1k的“,所以我们需要column=[访问次数], cell values = [1000],所以有[访问次数:(1000)]""", - "schema_links":"""["部门", "访问次数":(1000)]""", + "schemaLinks":"""["部门", "访问次数":(1000)]""", "sql":"""select 部门 from 内容库产品 where 访问次数 > 1000""" }, - { "current_date":"2023-09-18", - "table_name":"歌曲库", - "fields_list":"""["歌曲名", "MPPM歌手ID", "歌曲版本", "歌曲类型", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""", + { "currentDate":"2023-09-18", + "tableName":"歌曲库", + "fieldsList":"""["歌曲名", "MPPM歌手ID", "歌曲版本", "歌曲类型", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""", "question":"陈亿训唱的所有的播放量大于20k的孤勇者有哪些", - "prior_schema_links":"""['199509'->MPPM歌手ID, '1527123'->MPPM歌曲ID]""", + "priorSchemaLinks":"""['199509'->MPPM歌手ID, '1527123'->MPPM歌曲ID]""", "analysis": """让我们一步一步地思考。在问题“陈亿训唱的所有的播放量大于20k的孤勇者有哪些“中,我们被问: “孤勇者有哪些”,所以我们需要column=[歌曲名], cell values = ['孤勇者'],所以有[歌曲名:('孤勇者')] ”播放量大于20k的“,所以我们需要column=[结算播放量], cell values = [20000],所以有[结算播放量:(20000)] ”陈亿训唱的“,所以我们需要column=[歌手名], cell values = ['陈亿训'],所以有[歌手名:('陈亿训')]""", - "schema_links":"""["歌曲名":("'孤勇者'"), "结算播放量":(20000), "歌手名":("'陈亿训'")]""", + "schemaLinks":"""["歌曲名":("'孤勇者'"), "结算播放量":(20000), "歌手名":("'陈亿训'")]""", "sql":"""select 歌曲名 from 歌曲库 where 结算播放量 > 20000 and 歌手名 = '陈亿训' and 歌曲名 = '孤勇者'""" }, - { "current_date":"2023-09-18", - "table_name":"歌曲库", - "fields_list":"""["歌曲名", "歌曲版本", "歌手名", "歌曲类型", "发布时间", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""", + { "currentDate":"2023-09-18", + "tableName":"歌曲库", + "fieldsList":"""["歌曲名", "歌曲版本", "歌手名", "歌曲类型", "发布时间", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""", "question":"周洁轮去年发布的歌曲有哪些", - "prior_schema_links":"""['23109'->MPPM歌手ID]""", + "priorSchemaLinks":"""['23109'->MPPM歌手ID]""", "analysis": """让我们一步一步地思考。在问题“周洁轮去年发布的歌曲有哪些“中,我们被问: “歌曲有哪些”,所以我们需要column=[歌曲名] ”去年发布的“,所以我们需要column=[发布时间], cell values = [1],所以有[发布时间:(1)] ”周洁轮“,所以我们需要column=[歌手名], cell values = ['周洁轮'],所以有[歌手名:('周洁轮')]""", - "schema_links":"""["歌曲名", "发布时间":(1), "歌手名":("'周洁轮'")]""", + "schemaLinks":"""["歌曲名", "发布时间":(1), "歌手名":("'周洁轮'")]""", "sql":"""select 歌曲名 from 歌曲库 where datediff('year', 发布时间, '2023-09-18') <= 1 and 歌手名 = '周洁轮'""" }, - { "current_date":"2023-09-11", - "table_name":"艺人库", - "fields_list":"""["播放量层级", "播放量单调性", "播放量方差", "播放量突增类型", "播放量集中度", "歌手名", "歌手等级", "歌手类型", "歌手来源", "签约日期", "MPPM潮流人等级", "结算播放量", "运营播放量", "历史累计结算播放量", "有播放量歌曲数", "历史累计运营播放量", "付费用户结算播放量", "结算播放量占比", "运营播放份额", "免费用户结算播放占比", "完播量", "数据日期"]""", + { "currentDate":"2023-09-11", + "tableName":"艺人库", + "fieldsList":"""["播放量层级", "播放量单调性", "播放量方差", "播放量突增类型", "播放量集中度", "歌手名", "歌手等级", "歌手类型", "歌手来源", "签约日期", "MPPM潮流人等级", "结算播放量", "运营播放量", "历史累计结算播放量", "有播放量歌曲数", "历史累计运营播放量", "付费用户结算播放量", "结算播放量占比", "运营播放份额", "免费用户结算播放占比", "完播量", "数据日期"]""", "question":"我想要近半年签约的播放量前十的歌手有哪些", - "prior_schema_links":"""[]""", + "priorSchemaLinks":"""[]""", "analysis": """让我们一步一步地思考。在问题“我想要近半年签约的播放量前十的歌手“中,我们被问: “歌手有哪些”,所以我们需要column=[歌手名] ”播放量前十的“,所以我们需要column=[结算播放量], cell values = [10],所以有[结算播放量:(10)] ”近半年签约的“,所以我们需要column=[签约日期], cell values = [0.5],所以有[签约日期:(0.5)]""", - "schema_links":"""["歌手名", "结算播放量":(10), "签约日期":(0.5)]""", + "schemaLinks":"""["歌手名", "结算播放量":(10), "签约日期":(0.5)]""", "sql":"""select 歌手名 from 艺人库 where datediff('year', 签约日期, '2023-09-11') <= 0.5 order by 结算播放量 desc limit 10""" }, - { "current_date":"2023-08-12", - "table_name":"歌曲库", - "fields_list": """["发行日期", "歌曲语言", "歌曲来源", "歌曲流派", "歌曲名", "歌曲版本", "歌曲类型", "发行时间", "数据日期"]""", + { "currentDate":"2023-08-12", + "tableName":"歌曲库", + "fieldsList": """["发行日期", "歌曲语言", "歌曲来源", "歌曲流派", "歌曲名", "歌曲版本", "歌曲类型", "发行时间", "数据日期"]""", "question":"最近一年发行的歌曲中,有哪些在近7天播放超过一千万的", - "prior_schema_links":"""[]""", + "priorSchemaLinks":"""[]""", "analysis": """让我们一步一步地思考。在问题“最近一年发行的歌曲中,有哪些在近7天播放超过一千万的“中,我们被问: “发行的歌曲中,有哪些”,所以我们需要column=[歌曲名] ”最近一年发行的“,所以我们需要column=[发行日期], cell values = [1],所以有[发行日期:(1)] ”在近7天播放超过一千万的“,所以我们需要column=[数据日期, 结算播放量], cell values = [7, 10000000],所以有[数据日期:(7), 结算播放量:(10000000)]""", - "schema_links":"""["歌曲名", "发行日期":(1), "数据日期":(7), "结算播放量":(10000000)]""", + "schemaLinks":"""["歌曲名", "发行日期":(1), "数据日期":(7), "结算播放量":(10000000)]""", "sql":"""select 歌曲名 from 歌曲库 where datediff('year', 发行日期, '2023-08-12') <= 1 and datediff('day', 数据日期, '2023-08-12') <= 7 and 结算播放量 > 10000000""" }, - { "current_date":"2023-08-12", - "table_name":"歌曲库", - "fields_list": """["发行日期", "歌曲语言", "歌曲来源", "歌曲流派", "歌曲名", "歌曲版本", "歌曲类型", "发行时间", "数据日期"]""", + { "currentDate":"2023-08-12", + "tableName":"歌曲库", + "fieldsList": """["发行日期", "歌曲语言", "歌曲来源", "歌曲流派", "歌曲名", "歌曲版本", "歌曲类型", "发行时间", "数据日期"]""", "question":"今年以来发行的歌曲中,有哪些在近7天播放超过一千万的", - "prior_schema_links":"""[]""", + "priorSchemaLinks":"""[]""", "analysis": """让我们一步一步地思考。在问题“今年以来发行的歌曲中,有哪些在近7天播放超过一千万的“中,我们被问: “发行的歌曲中,有哪些”,所以我们需要column=[歌曲名] ”今年以来发行的“,所以我们需要column=[发行日期], cell values = [0],所以有[发行日期:(0)] ”在近7天播放超过一千万的“,所以我们需要column=[数据日期, 结算播放量], cell values = [7, 10000000],所以有[数据日期:(7), 结算播放量:(10000000)]""", - "schema_links":"""["歌曲名", "发行日期":(0), "数据日期":(7), "结算播放量":(10000000)]""", + "schemaLinks":"""["歌曲名", "发行日期":(0), "数据日期":(7), "结算播放量":(10000000)]""", "sql":"""select 歌曲名 from 歌曲库 where datediff('year', 发行日期, '2023-08-12') <= 0 and datediff('day', 数据日期, '2023-08-12') <= 7 and 结算播放量 > 10000000""" }, - { "current_date":"2023-08-12", - "table_name":"歌曲库", - "fields_list": """["发行日期", "歌曲语言", "歌曲来源", "歌曲流派", "歌曲名", "歌曲版本", "歌曲类型", "发行时间", "数据日期"]""", + { "currentDate":"2023-08-12", + "tableName":"歌曲库", + "fieldsList": """["发行日期", "歌曲语言", "歌曲来源", "歌曲流派", "歌曲名", "歌曲版本", "歌曲类型", "发行时间", "数据日期"]""", "question":"2023年以来发行的歌曲中,有哪些在近7天播放超过一千万的", - "prior_schema_links":"""['514129144'->MPPM歌曲ID]""", + "priorSchemaLinks":"""['514129144'->MPPM歌曲ID]""", "analysis": """让我们一步一步地思考。在问题“2023年以来发行的歌曲中,有哪些在近7天播放超过一千万的“中,我们被问: “发行的歌曲中,有哪些”,所以我们需要column=[歌曲名] ”2023年以来发行的“,所以我们需要column=[发行日期], cell values = ['2023-01-01'],所以有[发行日期:('2023-01-01')] ”在近7天播放超过一千万的“,所以我们需要column=[数据日期, 结算播放量], cell values = [7, 10000000],所以有[数据日期:(7), 结算播放量:(10000000)]""", - "schema_links":"""["歌曲名", "发行日期":("'2023-01-01'"), "数据日期":(7), "结算播放量":(10000000)]""", + "schemaLinks":"""["歌曲名", "发行日期":("'2023-01-01'"), "数据日期":(7), "结算播放量":(10000000)]""", "sql":"""select 歌曲名 from 歌曲库 where 发行日期 >= '2023-01-01' and datediff('day', 数据日期, '2023-08-12') <= 7 and 结算播放量 > 10000000""" }, - { "current_date":"2023-08-01", - "table_name":"歌曲库", - "fields_list":"""["歌曲名", "歌曲版本", "歌手名", "歌曲类型", "发布时间", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""", + { "currentDate":"2023-08-01", + "tableName":"歌曲库", + "fieldsList":"""["歌曲名", "歌曲版本", "歌手名", "歌曲类型", "发布时间", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""", "question":"周洁轮2023年6月之后发布的歌曲有哪些", - "prior_schema_links":"""['23109'->MPPM歌手ID]""", + "priorSchemaLinks":"""['23109'->MPPM歌手ID]""", "analysis": """让我们一步一步地思考。在问题“周洁轮2023年6月之后发布的歌曲有哪些“中,我们被问: “歌曲有哪些”,所以我们需要column=[歌曲名] ”2023年6月之后发布的“,所以我们需要column=[发布时间], cell values = ['2023-06-01'],所以有[发布时间:('2023-06-01')] ”周洁轮“,所以我们需要column=[歌手名], cell values = ['周洁轮'],所以有[歌手名:('周洁轮')]""", - "schema_links":"""["歌曲名", "发布时间":("'2023-06-01'"), "歌手名":("'周洁轮'")]""", + "schemaLinks":"""["歌曲名", "发布时间":("'2023-06-01'"), "歌手名":("'周洁轮'")]""", "sql":"""select 歌曲名 from 歌曲库 where 发布时间 >= '2023-06-01' and 歌手名 = '周洁轮'""" }, - { "current_date":"2023-08-01", - "table_name":"歌曲库", - "fields_list":"""["歌曲名", "歌曲版本", "歌手名", "歌曲类型", "发布时间", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""", + { "currentDate":"2023-08-01", + "tableName":"歌曲库", + "fieldsList":"""["歌曲名", "歌曲版本", "歌手名", "歌曲类型", "发布时间", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""", "question":"邓梓琦在2023年1月5日之后发布的歌曲中,有哪些播放量大于500W的?", - "prior_schema_links":"""['2312311'->MPPM歌手ID]""", + "priorSchemaLinks":"""['2312311'->MPPM歌手ID]""", "analysis": """让我们一步一步地思考。在问题“邓梓琦在2023年1月5日之后发布的歌曲中,有哪些播放量大于500W的?“中,我们被问: “歌曲中,有哪些”,所以我们需要column=[歌曲名] “播放量大于500W的”,所以我们需要column=[结算播放量], cell values = [5000000],所以有[结算播放量:(5000000)] ”邓梓琦在2023年1月5日之后发布的“,所以我们需要column=[发布时间], cell values = ['2023-01-05'],所以有[发布时间:('2023-01-05')] ”邓梓琦“,所以我们需要column=[歌手名], cell values = ['邓梓琦'],所以有[歌手名:('邓梓琦')]""", - "schema_links":"""["歌曲名", "结算播放量":(5000000), "发布时间":("'2023-01-05'"), "歌手名":("'邓梓琦'")]""", + "schemaLinks":"""["歌曲名", "结算播放量":(5000000), "发布时间":("'2023-01-05'"), "歌手名":("'邓梓琦'")]""", "sql":"""select 歌曲名 from 歌曲库 where 发布时间 >= '2023-01-05' and 歌手名 = '邓梓琦' and 结算播放量 > 5000000""" }, - { "current_date":"2023-09-17", - "table_name":"歌曲库", - "fields_list":"""["歌曲名", "歌曲版本", "歌手名", "歌曲类型", "发布时间", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""", + { "currentDate":"2023-09-17", + "tableName":"歌曲库", + "fieldsList":"""["歌曲名", "歌曲版本", "歌手名", "歌曲类型", "发布时间", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""", "question":"2023年6月以后,张亮英播放量大于200万的歌曲有哪些?", - "prior_schema_links":"""['45453'->MPPM歌手ID]""", + "priorSchemaLinks":"""['45453'->MPPM歌手ID]""", "analysis": """让我们一步一步地思考。在问题“2023年6月以后,张亮英播放量大于200万的歌曲有哪些?“中,我们被问: “播放量大于200万的”,所以我们需要column=[结算播放量], cell values = [2000000],所以有[结算播放量:(2000000)] ”2023年6月以后,张亮英“,所以我们需要column=[数据日期, 歌手名], cell values = ['2023-06-01', '张亮英'],所以有[数据日期:('2023-06-01'), 歌手名:('张亮英')], ”歌曲有哪些“,所以我们需要column=[歌曲名]""", - "schema_links":"""["结算播放量":(2000000), "数据日期":("'2023-06-01'"), "歌手名":("'张亮英'"), "歌曲名"]""", + "schemaLinks":"""["结算播放量":(2000000), "数据日期":("'2023-06-01'"), "歌手名":("'张亮英'"), "歌曲名"]""", "sql":"""select 歌曲名 from 歌曲库 where 数据日期 >= '2023-06-01' and 歌手名 = '张亮英' and 结算播放量 > 2000000""" }, - { "current_date":"2023-08-16", - "table_name":"歌曲库", - "fields_list":"""["歌曲名", "歌曲版本", "歌手名", "歌曲类型", "发布时间", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""", + { "currentDate":"2023-08-16", + "tableName":"歌曲库", + "fieldsList":"""["歌曲名", "歌曲版本", "歌手名", "歌曲类型", "发布时间", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""", "question":"2021年6月以后发布的李雨纯的播放量大于20万的歌曲有哪些", - "prior_schema_links":"""['23109'->MPPM歌手ID]""", + "priorSchemaLinks":"""['23109'->MPPM歌手ID]""", "analysis": """让我们一步一步地思考。在问题“2021年6月以后发布的李雨纯的播放量大于20万的歌曲有哪些“中,我们被问: “播放量大于20万的”,所以我们需要column=[结算播放量], cell values = [200000],所以有[结算播放量:(200000)] ”2021年6月以后发布的“,所以我们需要column=[发布时间], cell values = ['2021-06-01'],所以有[发布时间:('2021-06-01')] ”李雨纯“,所以我们需要column=[歌手名], cell values = ['李雨纯'],所以有[歌手名:('李雨纯')]""", - "schema_links":"""["结算播放量":(200000), "发布时间":("'2021-06-01'"), "歌手名":("'李雨纯'")]""", + "schemaLinks":"""["结算播放量":(200000), "发布时间":("'2021-06-01'"), "歌手名":("'李雨纯'")]""", "sql":"""select 歌曲名 from 歌曲库 where 发布时间 >= '2021-06-01' and 歌手名 = '李雨纯' and 结算播放量 > 200000""" }, - { "current_date":"2023-08-16", - "table_name":"歌曲库", - "fields_list":"""["歌曲名", "歌曲版本", "歌手名", "歌曲类型", "发布时间", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""", + { "currentDate":"2023-08-16", + "tableName":"歌曲库", + "fieldsList":"""["歌曲名", "歌曲版本", "歌手名", "歌曲类型", "发布时间", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""", "question":"刘锝桦在1992年4月2日到2020年5月2日之间发布的播放量大于20万的歌曲有哪些", - "prior_schema_links":"""['4234234'->MPPM歌手ID]""", + "priorSchemaLinks":"""['4234234'->MPPM歌手ID]""", "analysis": """让我们一步一步地思考。在问题“刘锝桦在1992年4月2日到2020年5月2日之间发布的播放量大于20万的歌曲有哪些“中,我们被问: “播放量大于20万的”,所以我们需要column=[结算播放量], cell values = [200000],所以有[结算播放量:(200000)] ”1992年4月2日到2020年5月2日之间发布的“, 所以我们需要column=[发布时间], cell values = ['1992-04-02', '2020-05-02'],所以有[发布时间:('1992-04-02', '2020-05-02')] ”刘锝桦“,所以我们需要column=[歌手名], cell values = ['刘锝桦'],所以有[歌手名:('刘锝桦')]""", - "schema_links":"""["结算播放量":(200000), "发布时间":("'1992-04-02'", "'2020-05-02'"), "歌手名":("'刘锝桦'")]""", + "schemaLinks":"""["结算播放量":(200000), "发布时间":("'1992-04-02'", "'2020-05-02'"), "歌手名":("'刘锝桦'")]""", "sql":"""select 歌曲名 from 歌曲库 where 发布时间 >= '1992-04-02' and 发布时间 <= '2020-05-02' and 歌手名 = '刘锝桦' and 结算播放量 > 200000""" }, { - "current_date":"2023-09-04", - "table_name":"内容库产品", - "fields_list":"""["用户名", "部门", "模块", "访问时长", "访问次数", "访问人数", "数据日期"]""", + "currentDate":"2023-09-04", + "tableName":"内容库产品", + "fieldsList":"""["用户名", "部门", "模块", "访问时长", "访问次数", "访问人数", "数据日期"]""", "question":"内容库近30天访问次数的平均数", - "prior_schema_links":"""[]""", + "priorSchemaLinks":"""[]""", "analysis": """让我们一步一步地思考。在问题“内容库近30天访问次数的平均数“中,我们被问: “访问次数的平均数”,所以我们需要column=[访问次数] ”内容库近30天“,所以我们需要column=[数据日期], cell values = [30],所以有[数据日期:(30)]""", - "schema_links":"""["访问次数", "数据日期":(30)]""", + "schemaLinks":"""["访问次数", "数据日期":(30)]""", "sql":"""select avg(访问次数) from 内容库产品 where datediff('day', 数据日期, '2023-09-04') <= 30 """ }, { - "current_date":"2023-09-04", - "table_name":"内容库产品", - "fields_list":"""["用户名", "部门", "模块", "访问时长", "访问次数", "访问人数", "数据日期"]""", + "currentDate":"2023-09-04", + "tableName":"内容库产品", + "fieldsList":"""["用户名", "部门", "模块", "访问时长", "访问次数", "访问人数", "数据日期"]""", "question":"内容库近半年哪个月的访问次数汇总最高", - "prior_schema_links":"""[]""", + "priorSchemaLinks":"""[]""", "analysis": """让我们一步一步地思考。在问题“内容库近半年哪个月的访问次数汇总最高“中,我们被问: “访问次数汇总最高”,所以我们需要column=[访问次数], cell values = [1],所以有[访问次数:(1)] ”内容库近半年“,所以我们需要column=[数据日期], cell values = [0.5],所以有[数据日期:(0.5)]""", - "schema_links":"""["访问次数":(1), "数据日期":(0.5)]""", + "schemaLinks":"""["访问次数":(1), "数据日期":(0.5)]""", "sql":"""select MONTH(数据日期), sum(访问次数) from 内容库产品 where datediff('year', 数据日期, '2023-09-04') <= 0.5 group by MONTH(数据日期) order by sum(访问次数) desc limit 1""" }, { - "current_date":"2023-09-04", - "table_name":"内容库产品", - "fields_list":"""["用户名", "部门", "模块", "访问时长", "访问次数", "访问人数", "数据日期"]""", + "currentDate":"2023-09-04", + "tableName":"内容库产品", + "fieldsList":"""["用户名", "部门", "模块", "访问时长", "访问次数", "访问人数", "数据日期"]""", "question":"内容库近半年每个月的平均访问次数", - "prior_schema_links":"""[]""", + "priorSchemaLinks":"""[]""", "analysis": """让我们一步一步地思考。在问题“内容库近半年每个月的平均访问次数“中,我们被问: “每个月的平均访问次数”,所以我们需要column=[访问次数] ”内容库近半年“,所以我们需要column=[数据日期], cell values = [0.5],所以有[数据日期:(0.5)]""", - "schema_links":"""["访问次数", "数据日期":(0.5)]""", + "schemaLinks":"""["访问次数", "数据日期":(0.5)]""", "sql":"""select MONTH(数据日期), avg(访问次数) from 内容库产品 where datediff('year', 数据日期, '2023-09-04') <= 0.5 group by MONTH(数据日期)""" }, { - "current_date":"2023-09-10", - "table_name":"内容库产品", - "fields_list":"""["用户名", "部门", "模块", "访问时长", "访问次数", "访问人数", "数据日期"]""", + "currentDate":"2023-09-10", + "tableName":"内容库产品", + "fieldsList":"""["用户名", "部门", "模块", "访问时长", "访问次数", "访问人数", "数据日期"]""", "question":"内容库 按部门统计访问次数 top10 的部门", - "prior_schema_links":"""[]""", + "priorSchemaLinks":"""[]""", "analysis": """让我们一步一步地思考。在问题“内容库 按部门统计访问次数 top10 的部门“中,我们被问: “访问次数 top10 的部门”,所以我们需要column=[访问次数], cell values = [10],所以有[访问次数:(10)] ”内容库 按部门统计“,所以我们需要column=[部门]""", - "schema_links":"""["访问次数":(10), "部门"]""", + "schemaLinks":"""["访问次数":(10), "部门"]""", "sql":"""select 部门, sum(访问次数) from 内容库产品 group by 部门 order by sum(访问次数) desc limit 10""" }, { - "current_date":"2023-09-10", - "table_name":"内容库产品", - "fields_list":"""["用户名", "部门", "模块", "访问时长", "访问次数", "访问人数", "数据日期"]""", + "currentDate":"2023-09-10", + "tableName":"内容库产品", + "fieldsList":"""["用户名", "部门", "模块", "访问时长", "访问次数", "访问人数", "数据日期"]""", "question":"超音速 近7个月,月度总访问量超过 2万的月份", - "prior_schema_links":"""[]""", + "priorSchemaLinks":"""[]""", "analysis": """让我们一步一步地思考。在问题“超音速 近7个月,月度总访问量超过 2万的月份“中,我们被问: “月度总访问量超过 2万的月份”,所以我们需要column=[访问次数], cell values = [20000],所以有[访问次数:(20000)] ”超音速 近7个月“,所以我们需要column=[数据日期], cell values = [7],所以有[数据日期:(7)]""", - "schema_links":"""["访问次数":(20000), "数据日期":(7)]""", + "schemaLinks":"""["访问次数":(20000), "数据日期":(7)]""", "sql":"""select MONTH(数据日期) from 内容库产品 where datediff('day', 数据日期, '2023-09-10') <= 7 group by MONTH(数据日期) having sum(访问次数) > 20000""" }, { - "current_date":"2023-09-10", - "table_name":"歌曲库", - "fields_list":"""["歌曲语言", "歌曲来源", "运营播放量", "播放量", "歌曲名", "结算播放量", "专辑名", "发布日期", "歌曲版本", "歌曲类型", "数据日期"]""", + "currentDate":"2023-09-10", + "tableName":"歌曲库", + "fieldsList":"""["歌曲语言", "歌曲来源", "运营播放量", "播放量", "歌曲名", "结算播放量", "专辑名", "发布日期", "歌曲版本", "歌曲类型", "数据日期"]""", "question":"2022年7月到2023年7月之间发布到歌曲,按播放量取top 100,再按月粒度来统计近1年的运营播放量", - "prior_schema_links":"""[]""", + "priorSchemaLinks":"""[]""", "analysis": """让我们一步一步地思考。在问题“2022年7月到2023年7月之间发布到歌曲,按播放量取top 100,再按月粒度来统计近1年的运营播放量“中,我们被问: “按月粒度来统计近1年的运营播放量”,所以我们需要column=[运营播放量, 数据日期], cell values = [1],所以有[运营播放量, 数据日期:(1)] ”按播放量取top 100“,所以我们需要column=[播放量], cell values = [100],所以有[播放量:(100)] “2022年7月到2023年7月之间发布到歌曲”,所以我们需要column=[发布日期], cell values = ['2022-07-01', '2023-07-01'],所以有[发布日期:('2022-07-01', '2023-07-01')]""", - "schema_links":"""["运营播放量", "数据日期":(1), "播放量":(100), "发布日期":("'2022-07-01'", "'2023-07-01'")]""", + "schemaLinks":"""["运营播放量", "数据日期":(1), "播放量":(100), "发布日期":("'2022-07-01'", "'2023-07-01'")]""", "sql":"""select MONTH(数据日期), sum(运营播放量) from (select 数据日期, 运营播放量 from 歌曲库 where 发布日期 >= '2022-07-01' and 发布日期 <= '2023-07-01' order by 播放量 desc limit 100) t where datediff('year', 数据日期, '2023-09-10') <= 1 group by MONTH(数据日期)""" }, { - "current_date":"2023-09-10", - "table_name":"歌曲库", - "fields_list":"""["歌曲语言", "歌曲来源", "运营播放量", "播放量", "歌曲名", "结算播放量", "专辑名", "发布日期", "歌曲版本", "歌曲类型", "数据日期"]""", + "currentDate":"2023-09-10", + "tableName":"歌曲库", + "fieldsList":"""["歌曲语言", "歌曲来源", "运营播放量", "播放量", "歌曲名", "结算播放量", "专辑名", "发布日期", "歌曲版本", "歌曲类型", "数据日期"]""", "question":"2022年7月到2023年7月之间发布到歌曲,按播放量取top100,再按月粒度来统计近1年的运营播放量之和,筛选出其中运营播放量之和大于2k的月份", - "prior_schema_links":"""[]""", + "priorSchemaLinks":"""[]""", "analysis": """让我们一步一步地思考。在问题“2022年7月到2023年7月之间发布到歌曲,按播放量取top100,再按月粒度来统计近1年的运营播放量之和,筛选出其中运营播放量之和大于2k的月份“中,我们被问: “筛选出其中运营播放量之和大于2k的月份”,所以我们需要column=[运营播放量], cell values = [2000],所以有[运营播放量:(2000)] ”按月粒度来统计近1年的运营播放量之和“,所以我们需要column=[数据日期], cell values = [1],所以有[数据日期:(1)] ”按播放量取top100“,所以我们需要column=[播放量], cell values = [100],所以有[播放量:(100)] ”2022年7月到2023年7月之间发布到歌曲“,所以我们需要column=[发布日期], cell values = ['2022-07-01', '2023-07-01'],所以有[发布日期:('2022-07-01', '2023-07-01')]""", - "schema_links":"""["运营播放量":(2000), "数据日期":(1), "播放量":(100), "发布日期":("'2022-07-01'", "'2023-07-01'")]""", + "schemaLinks":"""["运营播放量":(2000), "数据日期":(1), "播放量":(100), "发布日期":("'2022-07-01'", "'2023-07-01'")]""", "sql":"""select MONTH(数据日期), sum(运营播放量) from (select 数据日期, 运营播放量 from 歌曲库 where 发布日期 >= '2022-07-01' and 发布日期 <= '2023-07-01' order by 播放量 desc limit 100) t where datediff('year', 数据日期, '2023-09-10') <= 1 group by MONTH(数据日期) having sum(运营播放量) > 2000""" + }, + { + "currentDate":"2023-11-01", + "tableName":"营销月模型", + "fieldsList":"""["国家中文名", "机型类别", "销量", "数据日期"]""", + "question":"今年智能机在哪个国家的销量之和最高", + "priorSchemaLinks":"""['智能机'->机型类别]""", + "analysis": """让我们一步一步地思考。在问题“今年智能机在哪个国家的销量之和最高“中,我们被问: +“销量最高”,所以我们需要column=[销量], cell values = [1],所以有[销量:(1)] +”今年“,所以我们需要column=[数据日期], cell values = ['2023-01-01', '2023-11-01'],所以有[数据日期:('2023-01-01', '2023-11-01')] +”智能机“,所以我们需要column=[机型类别], cell values = ['智能机'],所以有[机型类别:('智能机')]""", + "schemaLinks":"""["销量":(1), "数据日期":("'2023-01-01'", "'2023-11-01'"), "机型类别":("'智能机'")]""", + "sql":"""select 国家中文名, sum(销量) from 营销月模型 where 机型类别 = '智能机' and 数据日期 >= '2023-01-01' and 数据日期 <= '2023-11-01' group by 国家中文名 order by sum(销量) desc limit 1""" } ] \ No newline at end of file diff --git a/chat/core/src/main/python/services/sql/constructor.py b/chat/core/src/main/python/services/sql/constructor.py index 3d4ea47e8..6d8e9e5bf 100644 --- a/chat/core/src/main/python/services/sql/constructor.py +++ b/chat/core/src/main/python/services/sql/constructor.py @@ -11,41 +11,52 @@ from instances.logging_instance import logger from services.query_retrieval.retriever import ChromaCollectionRetriever class FewShotPromptTemplate2(object): - def __init__(self, collection:Collection, few_shot_examples:List[Mapping[str, str]], - retrieval_key:str, few_shot_seperator:str = "\n\n") -> None: + def __init__(self, collection:Collection, retrieval_key:str, few_shot_seperator:str = "\n\n") -> None: self.collection = collection self.few_shot_retriever = ChromaCollectionRetriever(self.collection) - self.few_shot_examples = few_shot_examples self.retrieval_key = retrieval_key self.few_shot_seperator = few_shot_seperator - def add_few_shot_example(self, examples: List[Mapping[str, str]])-> None: + def add_few_shot_example(self, example_ids: List[str] , example_units: List[Mapping[str, str]])-> None: query_text_list = [] - query_id_list = [] - for idx, example in enumerate(examples): - query_text_list.append(example[self.retrieval_key]) - query_id_list.append(str(idx)) + + for idx, example_unit in enumerate(example_units): + query_text_list.append(example_unit[self.retrieval_key]) - self.few_shot_retriever.add_queries(query_text_list=query_text_list, query_id_list=query_id_list, metadatas=examples) + self.few_shot_retriever.add_queries(query_text_list=query_text_list, query_id_list=example_ids, metadatas=example_units) - def reload_few_shot_example(self, examples: List[Mapping[str, str]])-> None: - logger.info(f"original sql_examples_collection size: {self.few_shot_retriever.get_query_size()}") + def update_few_shot_example(self, example_ids: List[str] , example_units: List[Mapping[str, str]])-> None: + query_text_list = [] + + for idx, example_unit in enumerate(example_units): + query_text_list.append(example_unit[self.retrieval_key]) + + self.few_shot_retriever.update_queries(query_text_list=query_text_list, query_id_list=example_ids, metadatas=example_units) + + def delete_few_shot_example(self, example_ids: List[str])-> None: + self.few_shot_retriever.delete_queries_by_ids(query_ids=example_ids) + + def count_few_shot_example(self)-> int: + return self.few_shot_retriever.get_query_size() + + def reload_few_shot_example(self, example_ids: List[str] , example_units: List[Mapping[str, str]])-> None: + logger.info(f"original {self.collection.name} size: {self.few_shot_retriever.get_query_size()}") self.few_shot_retriever.empty_query_collection() - logger.info(f"emptied sql_examples_collection size: {self.few_shot_retriever.get_query_size()}") + logger.info(f"emptied {self.collection.name} size: {self.few_shot_retriever.get_query_size()}") - self.add_few_shot_example(examples=examples) - logger.info(f"reloaded sql_examples_collection size: {self.few_shot_retriever.get_query_size()}") + self.add_few_shot_example(example_ids=example_ids, example_units=example_units) + logger.info(f"reloaded {self.collection.name} size: {self.few_shot_retriever.get_query_size()}") def _sub_dict(self, d:Mapping[str, str], keys:List[str])-> Mapping[str, str]: return {k:d[k] for k in keys if k in d} - def retrieve_few_shot_example(self, query_text: str, retrieval_num: int)-> List[Mapping[str, str]]: + def retrieve_few_shot_example(self, query_text: str, retrieval_num: int, filter_condition: Mapping[str,str] =None)-> List[Mapping[str, str]]: query_text_list = [query_text] retrieval_res_list = self.few_shot_retriever.retrieval_query_run(query_texts_list=query_text_list, - filter_condition=None, n_results=retrieval_num) + filter_condition=filter_condition, n_results=retrieval_num) retrieval_res_unit_list = retrieval_res_list[0]['retrieval'] return retrieval_res_unit_list @@ -62,4 +73,3 @@ class FewShotPromptTemplate2(object): few_shot_example_str = self.few_shot_seperator.join(few_shot_example_str_unit_list) return few_shot_example_str - diff --git a/chat/core/src/main/python/services/sql/examples_reload_run.py b/chat/core/src/main/python/services/sql/examples_reload_run.py index 1d93d3bc9..67c3558e6 100644 --- a/chat/core/src/main/python/services/sql/examples_reload_run.py +++ b/chat/core/src/main/python/services/sql/examples_reload_run.py @@ -15,7 +15,7 @@ from instances.logging_instance import logger from config.config_parse import (TEXT2DSL_EXAMPLE_NUM, TEXT2DSL_FEWSHOTS_NUM, TEXT2DSL_SELF_CONSISTENCY_NUM, LLMPARSER_HOST, LLMPARSER_PORT, TEXT2DSL_IS_SHORTCUT, TEXT2DSL_IS_SELF_CONSISTENCY) -from few_shot_example.sql_exampler import examplars as sql_examplars +from few_shot_example.sql_examplar import examplars as sql_examplars def text2sql_agent_setting_update(llm_host:str, llm_port:str, @@ -43,9 +43,11 @@ def text2dsl_agent_wrapper_setting_update(llm_host:str, llm_port:str, is_shortcut:bool, is_self_consistency:bool, sql_examplars:List[Mapping[str, str]], example_nums:int, fewshot_nums:int, self_consistency_nums:int): + sql_ids = [str(i) for i in range(0, len(sql_examplars))] + url = f"http://{llm_host}:{llm_port}/query2sql_setting_update/" payload = {"isShortcut":is_shortcut, "isSelfConsistency":is_self_consistency, - "sqlExamplars":sql_examplars, + "sqlExamplars":sql_examplars, "sqlIds": sql_ids, "exampleNums":example_nums, "fewshotNums":fewshot_nums, "selfConsistencyNums":self_consistency_nums} headers = {'content-type': 'application/json'} response = requests.post(url, data=json.dumps(payload), headers=headers) diff --git a/chat/core/src/main/python/services/sql/run.py b/chat/core/src/main/python/services/sql/run.py index 9cb6c96d8..3ffaf3573 100644 --- a/chat/core/src/main/python/services/sql/run.py +++ b/chat/core/src/main/python/services/sql/run.py @@ -17,7 +17,7 @@ from instances.text2vec import Text2VecEmbeddingFunction from instances.chromadb_instance import client from instances.logging_instance import logger -from few_shot_example.sql_exampler import examplars as sql_examplars +from few_shot_example.sql_examplar import examplars as sql_examplars from config.config_parse import (TEXT2DSLAGENT_COLLECTION_NAME, TEXT2DSLAGENTCS_COLLECTION_NAME, TEXT2DSL_EXAMPLE_NUM, TEXT2DSL_FEWSHOTS_NUM, TEXT2DSL_SELF_CONSISTENCY_NUM, TEXT2DSL_IS_SHORTCUT, TEXT2DSL_IS_SELF_CONSISTENCY) @@ -32,12 +32,10 @@ text2dsl_agentcs_collection = client.get_or_create_collection(name=TEXT2DSLAGENT metadata={"hnsw:space": "cosine"}) text2dsl_agent_example_prompter = FewShotPromptTemplate2(collection=text2dsl_agent_collection, - few_shot_examples=sql_examplars, retrieval_key="question", few_shot_seperator='\n\n') text2dsl_agentcs_example_prompter = FewShotPromptTemplate2(collection=text2dsl_agentcs_collection, - few_shot_examples=sql_examplars, retrieval_key="question", few_shot_seperator='\n\n') @@ -47,9 +45,9 @@ text2sql_agent = Text2DSLAgent(num_fewshots=TEXT2DSL_EXAMPLE_NUM, text2sql_cs_agent = Text2DSLAgentConsistency(num_fewshots=TEXT2DSL_FEWSHOTS_NUM, num_examples=TEXT2DSL_EXAMPLE_NUM, num_self_consistency=TEXT2DSL_SELF_CONSISTENCY_NUM, sql_example_prompter=text2dsl_agentcs_example_prompter, llm=llm) -text2sql_agent.update_examples(sql_examplars, TEXT2DSL_EXAMPLE_NUM) - -text2sql_cs_agent.update_examples(sql_examplars, TEXT2DSL_EXAMPLE_NUM, TEXT2DSL_FEWSHOTS_NUM, TEXT2DSL_SELF_CONSISTENCY_NUM) +sql_ids = [str(i) for i in range(0, len(sql_examplars))] +text2sql_agent.reload_setting(sql_ids, sql_examplars, TEXT2DSL_EXAMPLE_NUM) +text2sql_cs_agent.reload_setting(sql_ids, sql_examplars, TEXT2DSL_EXAMPLE_NUM, TEXT2DSL_FEWSHOTS_NUM, TEXT2DSL_SELF_CONSISTENCY_NUM) text2sql_agent_router = Text2DSLAgentWrapper(sql_agent=text2sql_agent, sql_agent_cs=text2sql_cs_agent, diff --git a/chat/core/src/main/python/services/sql/sql_agent.py b/chat/core/src/main/python/services/sql/sql_agent.py index 0325cf25b..321463246 100644 --- a/chat/core/src/main/python/services/sql/sql_agent.py +++ b/chat/core/src/main/python/services/sql/sql_agent.py @@ -12,7 +12,6 @@ sys.path.append(os.path.dirname(os.path.abspath(__file__))) from instances.logging_instance import logger - from sql.constructor import FewShotPromptTemplate2 from sql.output_parser import schema_link_parse, combo_schema_link_parse, combo_sql_parse @@ -25,12 +24,25 @@ class Text2DSLAgent(object): self.sql_example_prompter = sql_example_prompter self.llm = llm - def update_examples(self, sql_examplars, num_fewshots): + def reload_setting(self, sql_example_ids: List[str], sql_example_units: List[Mapping[str,str]], num_fewshots: int): self.num_fewshots = num_fewshots - self.sql_example_prompter.reload_few_shot_example(sql_examplars) + + self.sql_example_prompter.reload_few_shot_example(sql_example_ids, sql_example_units) - def get_fewshot_examples(self, query_text: str)->List[Mapping[str, str]]: - few_shot_example_meta_list = self.sql_example_prompter.retrieve_few_shot_example(query_text, self.num_fewshots) + def add_examples(self, sql_example_ids: List[str], sql_example_units: List[Mapping[str,str]]): + self.sql_example_prompter.add_few_shot_example(sql_example_ids, sql_example_units) + + def update_examples(self, sql_example_ids: List[str], sql_example_units: List[Mapping[str,str]]): + self.sql_example_prompter.update_few_shot_example(sql_example_ids, sql_example_units) + + def delete_examples(self, sql_example_ids: List[str]): + self.sql_example_prompter.delete_few_shot_example(sql_example_ids) + + def count_examples(self): + return self.sql_example_prompter.count_few_shot_example() + + def get_fewshot_examples(self, query_text: str, filter_condition: Mapping[str,str])->List[Mapping[str, str]]: + few_shot_example_meta_list = self.sql_example_prompter.retrieve_few_shot_example(query_text, self.num_fewshots, filter_condition) return few_shot_example_meta_list @@ -41,14 +53,14 @@ class Text2DSLAgent(object): instruction = "# 根据数据库的表结构,参考先验信息,找出为每个问题生成SQL查询语句的schema_links" - schema_linking_example_keys = ["table_name", "fields_list", "prior_schema_links", "question", "analysis", "schema_links"] - schema_linking_example_template = "Table {table_name}, columns = {fields_list}, prior_schema_links = {prior_schema_links}\n问题:{question}\n分析:{analysis} 所以Schema_links是:\nSchema_links:{schema_links}" + schema_linking_example_keys = ["tableName", "fieldsList", "priorSchemaLinks", "question", "analysis", "schemaLinks"] + schema_linking_example_template = "Table {tableName}, columns = {fieldsList}, prior_schema_links = {priorSchemaLinks}\n问题:{question}\n分析:{analysis} 所以Schema_links是:\nSchema_links:{schemaLinks}" schema_linking_fewshot_prompt = self.sql_example_prompter.make_few_shot_example_prompt(few_shot_template=schema_linking_example_template, example_keys=schema_linking_example_keys, few_shot_example_meta_list=fewshot_example_list) - new_case_template = "Table {table_name}, columns = {fields_list}, prior_schema_links = {prior_schema_links}\n问题:{question}\n分析: 让我们一步一步地思考。" - new_case_prompt = new_case_template.format(table_name=domain_name, fields_list=fields_list, prior_schema_links=prior_schema_links_str, question=user_query) + new_case_template = "Table {tableName}, columns = {fieldsList}, prior_schema_links = {priorSchemaLinks}\n问题:{question}\n分析: 让我们一步一步地思考。" + new_case_prompt = new_case_template.format(tableName=domain_name, fieldsList=fields_list, priorSchemaLinks=prior_schema_links_str, question=user_query) schema_linking_prompt = instruction + '\n\n' + schema_linking_fewshot_prompt + '\n\n' + new_case_prompt return schema_linking_prompt @@ -57,16 +69,15 @@ class Text2DSLAgent(object): schema_link_str: str, data_date: str, fewshot_example_list:List[Mapping[str, str]])-> str: instruction = "# 根据schema_links为每个问题生成SQL查询语句" - sql_example_keys = ["question", "current_date", "table_name", "schema_links", "sql"] - sql_example_template = "问题:{question}\nCurrent_date:{current_date}\nTable {table_name}\nSchema_links:{schema_links}\nSQL:{sql}" - + sql_example_keys = ["question", "currentDate", "tableName", "schemaLinks", "sql"] + sql_example_template = "问题:{question}\nCurrent_date:{currentDate}\nTable {tableName}\nSchema_links:{schemaLinks}\nSQL:{sql}" sql_example_fewshot_prompt = self.sql_example_prompter.make_few_shot_example_prompt(few_shot_template=sql_example_template, example_keys=sql_example_keys, few_shot_example_meta_list=fewshot_example_list) - new_case_template = "问题:{question}\nCurrent_date:{current_date}\nTable {table_name}\nSchema_links:{schema_links}\nSQL:" - new_case_prompt = new_case_template.format(question=user_query, current_date=data_date, table_name=domain_name, schema_links=schema_link_str) + new_case_template = "问题:{question}\nCurrent_date:{currentDate}\nTable {tableName}\nSchema_links:{schemaLinks}\nSQL:" + new_case_prompt = new_case_template.format(question=user_query, currentDate=data_date, tableName=domain_name, schemaLinks=schema_link_str) sql_example_prompt = instruction + '\n\n' + sql_example_fewshot_prompt + '\n\n' + new_case_prompt @@ -83,20 +94,20 @@ class Text2DSLAgent(object): instruction = "# 根据数据库的表结构,参考先验信息,找出为每个问题生成SQL查询语句的schema_links,再根据schema_links为每个问题生成SQL查询语句" - example_keys = ["table_name", "fields_list", "prior_schema_links", "current_date", "question", "analysis", "schema_links", "sql"] - example_template = "Table {table_name}, columns = {fields_list}, prior_schema_links = {prior_schema_links}\nCurrent_date:{current_date}\n问题:{question}\n分析:{analysis} 所以Schema_links是:\nSchema_links:{schema_links}\nSQL:{sql}" + example_keys = ["tableName", "fieldsList", "priorSchemaLinks", "currentDate", "question", "analysis", "schemaLinks", "sql"] + example_template = "Table {tableName}, columns = {fieldsList}, prior_schema_links = {priorSchemaLinks}\nCurrent_date:{currentDate}\n问题:{question}\n分析:{analysis} 所以Schema_links是:\nSchema_links:{schemaLinks}\nSQL:{sql}" fewshot_prompt = self.sql_example_prompter.make_few_shot_example_prompt(few_shot_template=example_template, example_keys=example_keys, few_shot_example_meta_list=fewshot_example_list) - new_case_template = "Table {table_name}, columns = {fields_list}, prior_schema_links = {prior_schema_links}\nCurrent_date:{current_date}\n问题:{question}\n分析: 让我们一步一步地思考。" - new_case_prompt = new_case_template.format(table_name=domain_name, fields_list=fields_list, prior_schema_links=prior_schema_links_str, current_date=data_date, question=user_query) + new_case_template = "Table {tableName}, columns = {fieldsList}, prior_schema_links = {priorSchemaLinks}\nCurrent_date:{currentDate}\n问题:{question}\n分析: 让我们一步一步地思考。" + new_case_prompt = new_case_template.format(tableName=domain_name, fieldsList=fields_list, priorSchemaLinks=prior_schema_links_str, currentDate=data_date, question=user_query) prompt = instruction + '\n\n' + fewshot_prompt + '\n\n' + new_case_prompt return prompt - async def async_query2sql(self, query_text: str, + async def async_query2sql(self, query_text: str, filter_condition: Mapping[str,str], model_name: str, fields_list: List[str], data_date: str, prior_schema_links: Mapping[str,str], prior_exts: str): logger.info("query_text: {}".format(query_text)) @@ -106,10 +117,11 @@ class Text2DSLAgent(object): logger.info("prior_schema_links: {}".format(prior_schema_links)) logger.info("prior_exts: {}".format(prior_exts)) - query_text = query_text + ' 备注:'+prior_exts + if prior_exts != '': + query_text = query_text + ' 备注:'+prior_exts logger.info("query_text_prior_exts: {}".format(query_text)) - fewshot_example_meta_list = self.get_fewshot_examples(query_text) + fewshot_example_meta_list = self.get_fewshot_examples(query_text, filter_condition) schema_linking_prompt = self.generate_schema_linking_prompt(query_text, model_name, fields_list, prior_schema_links, fewshot_example_meta_list) logger.debug("schema_linking_prompt->{}".format(schema_linking_prompt)) schema_link_output = await self.llm._call_async(schema_linking_prompt) @@ -136,7 +148,7 @@ class Text2DSLAgent(object): return resp - async def async_query2sql_shortcut(self, query_text: str, + async def async_query2sql_shortcut(self, query_text: str, filter_condition: Mapping[str,str], model_name: str, fields_list: List[str], data_date: str, prior_schema_links: Mapping[str,str], prior_exts: str): logger.info("query_text: {}".format(query_text)) @@ -146,10 +158,11 @@ class Text2DSLAgent(object): logger.info("prior_schema_links: {}".format(prior_schema_links)) logger.info("prior_exts: {}".format(prior_exts)) - query_text = query_text + ' 备注:'+prior_exts + if prior_exts != '': + query_text = query_text + ' 备注:'+prior_exts logger.info("query_text_prior_exts: {}".format(query_text)) - fewshot_example_meta_list = self.get_fewshot_examples(query_text) + fewshot_example_meta_list = self.get_fewshot_examples(query_text, filter_condition) schema_linking_sql_shortcut_prompt = self.generate_schema_linking_sql_prompt(query_text, model_name, data_date, fields_list, prior_schema_links, fewshot_example_meta_list) logger.debug("schema_linking_sql_shortcut_prompt->{}".format(schema_linking_sql_shortcut_prompt)) schema_linking_sql_shortcut_output = await self.llm._call_async(schema_linking_sql_shortcut_prompt) @@ -183,16 +196,28 @@ class Text2DSLAgentConsistency(object): self.llm = llm self.sql_example_prompter = sql_example_prompter - def update_examples(self, sql_examplars, num_examples, num_fewshots, num_self_consistency): + def reload_setting(self, sql_example_ids:List[str], sql_example_units: List[Mapping[str, str]], num_examples:int, num_fewshots:int, num_self_consistency:int): self.num_fewshots = num_fewshots self.num_examples = num_examples assert self.num_fewshots <= self.num_examples self.num_self_consistency = num_self_consistency assert self.num_self_consistency >= 1 - self.sql_example_prompter.reload_few_shot_example(sql_examplars) + self.sql_example_prompter.reload_few_shot_example(sql_example_ids, sql_example_units) - def get_examples_candidates(self, query_text: str)->List[Mapping[str, str]]: - few_shot_example_meta_list = self.sql_example_prompter.retrieve_few_shot_example(query_text, self.num_examples) + def add_examples(self, sql_example_ids:List[str], sql_example_units: List[Mapping[str, str]]): + self.sql_example_prompter.add_few_shot_example(sql_example_ids, sql_example_units) + + def update_examples(self, sql_example_ids:List[str], sql_example_units: List[Mapping[str, str]]): + self.sql_example_prompter.update_few_shot_example(sql_example_ids, sql_example_units) + + def delete_examples(self, sql_example_ids:List[str]): + self.sql_example_prompter.delete_few_shot_example(sql_example_ids) + + def count_examples(self): + return self.sql_example_prompter.count_few_shot_example() + + def get_examples_candidates(self, query_text: str, filter_condition: Mapping[str, str])->List[Mapping[str, str]]: + few_shot_example_meta_list = self.sql_example_prompter.retrieve_few_shot_example(query_text, self.num_examples, filter_condition) return few_shot_example_meta_list @@ -211,14 +236,14 @@ class Text2DSLAgentConsistency(object): instruction = "# 根据数据库的表结构,参考先验信息,找出为每个问题生成SQL查询语句的schema_links" - schema_linking_example_keys = ["table_name", "fields_list", "prior_schema_links", "question", "analysis", "schema_links"] - schema_linking_example_template = "Table {table_name}, columns = {fields_list}, prior_schema_links = {prior_schema_links}\n问题:{question}\n分析:{analysis} 所以Schema_links是:\nSchema_links:{schema_links}" + schema_linking_example_keys = ["tableName", "fieldsList", "priorSchemaLinks", "question", "analysis", "schemaLinks"] + schema_linking_example_template = "Table {tableName}, columns = {fieldsList}, prior_schema_links = {priorSchemaLinks}\n问题:{question}\n分析:{analysis} 所以Schema_links是:\nSchema_links:{schemaLinks}" schema_linking_fewshot_prompt = self.sql_example_prompter.make_few_shot_example_prompt(few_shot_template=schema_linking_example_template, example_keys=schema_linking_example_keys, few_shot_example_meta_list=fewshot_example_list) - new_case_template = "Table {table_name}, columns = {fields_list}, prior_schema_links = {prior_schema_links}\n问题:{question}\n分析: 让我们一步一步地思考。" - new_case_prompt = new_case_template.format(table_name=domain_name, fields_list=fields_list, prior_schema_links=prior_schema_links_str, question=user_query) + new_case_template = "Table {tableName}, columns = {fieldsList}, prior_schema_links = {priorSchemaLinks}\n问题:{question}\n分析: 让我们一步一步地思考。" + new_case_prompt = new_case_template.format(tableName=domain_name, fieldsList=fields_list, priorSchemaLinks=prior_schema_links_str, question=user_query) schema_linking_prompt = instruction + '\n\n' + schema_linking_fewshot_prompt + '\n\n' + new_case_prompt return schema_linking_prompt @@ -236,16 +261,15 @@ class Text2DSLAgentConsistency(object): schema_link_str: str, data_date: str, fewshot_example_list:List[Mapping[str, str]])-> str: instruction = "# 根据schema_links为每个问题生成SQL查询语句" - sql_example_keys = ["question", "current_date", "table_name", "schema_links", "sql"] - sql_example_template = "问题:{question}\nCurrent_date:{current_date}\nTable {table_name}\nSchema_links:{schema_links}\nSQL:{sql}" - + sql_example_keys = ["question", "currentDate", "tableName", "schemaLinks", "sql"] + sql_example_template = "问题:{question}\nCurrent_date:{currentDate}\nTable {tableName}\nSchema_links:{schemaLinks}\nSQL:{sql}" sql_example_fewshot_prompt = self.sql_example_prompter.make_few_shot_example_prompt(few_shot_template=sql_example_template, example_keys=sql_example_keys, few_shot_example_meta_list=fewshot_example_list) - new_case_template = "问题:{question}\nCurrent_date:{current_date}\nTable {table_name}\nSchema_links:{schema_links}\nSQL:" - new_case_prompt = new_case_template.format(question=user_query, current_date=data_date, table_name=domain_name, schema_links=schema_link_str) + new_case_template = "问题:{question}\nCurrent_date:{currentDate}\nTable {tableName}\nSchema_links:{schemaLinks}\nSQL:" + new_case_prompt = new_case_template.format(question=user_query, currentDate=data_date, tableName=domain_name, schemaLinks=schema_link_str) sql_example_prompt = instruction + '\n\n' + sql_example_fewshot_prompt + '\n\n' + new_case_prompt @@ -302,7 +326,7 @@ class Text2DSLAgentConsistency(object): return sql_output_res_pool - async def tasks_run(self, user_query: str, domain_name: str, fields_list: List[str], prior_schema_links: Mapping[str,str], data_date: str, prior_exts: str): + async def tasks_run(self, user_query: str, filter_condition: Mapping[str, str], domain_name: str, fields_list: List[str], prior_schema_links: Mapping[str,str], data_date: str, prior_exts: str): logger.info("user_query: {}".format(user_query)) logger.info("domain_name: {}".format(domain_name)) logger.info("fields_list: {}".format(fields_list)) @@ -310,10 +334,11 @@ class Text2DSLAgentConsistency(object): logger.info("prior_schema_links: {}".format(prior_schema_links)) logger.info("prior_exts: {}".format(prior_exts)) - user_query = user_query + ' 备注:'+prior_exts + if prior_exts != '': + user_query = user_query + ' 备注:'+prior_exts logger.info("user_query_prior_exts: {}".format(user_query)) - fewshot_example_meta_list = self.get_examples_candidates(user_query) + fewshot_example_meta_list = self.get_examples_candidates(user_query, filter_condition) fewshot_example_list_combo = self.get_fewshot_example_combos(fewshot_example_meta_list) schema_linking_output_candidates = await self.generate_schema_linking_tasks(user_query, domain_name, fields_list, prior_schema_links, fewshot_example_list_combo) @@ -354,20 +379,20 @@ class Text2DSLAgentWrapper(object): self.is_shortcut = is_shortcut self.is_self_consistency = is_self_consistency - async def async_query2sql(self, query_text: str, + async def async_query2sql(self, query_text: str, filter_condition: Mapping[str,str], model_name: str, fields_list: List[str], data_date: str, prior_schema_links: Mapping[str,str], prior_exts: str): if self.is_self_consistency: logger.info("sql wrapper: self_consistency") - resp = await self.sql_agent_cs.tasks_run(user_query=query_text, domain_name=model_name, fields_list=fields_list, prior_schema_links=prior_schema_links, data_date=data_date, prior_exts=prior_exts) + resp = await self.sql_agent_cs.tasks_run(user_query=query_text, filter_condition=filter_condition, domain_name=model_name, fields_list=fields_list, prior_schema_links=prior_schema_links, data_date=data_date, prior_exts=prior_exts) return resp elif self.is_shortcut: logger.info("sql wrapper: shortcut") - resp = await self.sql_agent.async_query2sql_shortcut(query_text=query_text, model_name=model_name, fields_list=fields_list, data_date=data_date, prior_schema_links=prior_schema_links, prior_exts=prior_exts) + resp = await self.sql_agent.async_query2sql_shortcut(query_text=query_text, filter_condition=filter_condition, model_name=model_name, fields_list=fields_list, data_date=data_date, prior_schema_links=prior_schema_links, prior_exts=prior_exts) return resp else: logger.info("sql wrapper: normal") - resp = await self.sql_agent.async_query2sql(query_text=query_text, model_name=model_name, fields_list=fields_list, data_date=data_date, prior_schema_links=prior_schema_links, prior_exts=prior_exts) + resp = await self.sql_agent.async_query2sql(query_text=query_text, filter_condition=filter_condition, model_name=model_name, fields_list=fields_list, data_date=data_date, prior_schema_links=prior_schema_links, prior_exts=prior_exts) return resp def update_configs(self, is_shortcut, is_self_consistency, diff --git a/chat/core/src/main/python/services_router/query2sql_service.py b/chat/core/src/main/python/services_router/query2sql_service.py index 682457f69..9184e7431 100644 --- a/chat/core/src/main/python/services_router/query2sql_service.py +++ b/chat/core/src/main/python/services_router/query2sql_service.py @@ -40,11 +40,16 @@ async def query2sql(query_body: Mapping[str, Any]): else: prior_exts = query_body['priorExts'] + if 'filterCondition' not in query_body: + raise HTTPException(status_code=400, detail="filterCondition is not in query_body") + else: + filter_condition = query_body['filterCondition'] + model_name = schema['modelName'] fields_list = schema['fieldNameList'] prior_schema_links = {item['fieldValue']:item['fieldName'] for item in linking} - resp = await text2sql_agent_router.async_query2sql(query_text=query_text, model_name=model_name, fields_list=fields_list, + resp = await text2sql_agent_router.async_query2sql(query_text=query_text, filter_condition=filter_condition, model_name=model_name, fields_list=fields_list, data_date=current_date, prior_schema_links=prior_schema_links, prior_exts=prior_exts) return resp @@ -57,6 +62,11 @@ def query2sql_setting_update(query_body: Mapping[str, Any]): else: sql_examplars = query_body['sqlExamplars'] + if 'sqlIds' not in query_body: + raise HTTPException(status_code=400, detail="sqlIds is not in query_body") + else: + sql_ids = query_body['sqlIds'] + if 'exampleNums' not in query_body: raise HTTPException(status_code=400, detail="exampleNums is not in query_body") else: @@ -82,7 +92,70 @@ def query2sql_setting_update(query_body: Mapping[str, Any]): else: is_self_consistency = query_body['isSelfConsistency'] - text2sql_agent_router.update_configs(is_shortcut=is_shortcut, is_self_consistency=is_self_consistency, sql_examplars=sql_examplars, + text2sql_agent_router.update_configs(is_shortcut=is_shortcut, is_self_consistency=is_self_consistency, + sql_example_ids=sql_ids, sql_example_units=sql_examplars, num_examples=example_nums, num_fewshots=fewshot_nums, num_self_consistency=self_consistency_nums) return "success" + + +@router.post("/query2sql_add_examples") +def query2sql_add_examples(query_body: Mapping[str, Any]): + if 'sqlIds' not in query_body: + raise HTTPException(status_code=400, detail="sqlIds is not in query_body") + else: + sql_ids = query_body['sqlIds'] + + if 'sqlExamplars' not in query_body: + raise HTTPException(status_code=400, + detail="sqlExamplars is not in query_body") + else: + sql_examplars = query_body['sqlExamplars'] + + text2sql_agent_router.sql_agent.add_examples(sql_example_ids=sql_ids, sql_example_units=sql_examplars) + text2sql_agent_router.sql_agent_cs.add_examples(sql_example_ids=sql_ids, sql_example_units=sql_examplars) + + return "success" + + +@router.post("/query2sql_update_examples") +def query2sql_update_examples(query_body: Mapping[str, Any]): + if 'sqlIds' not in query_body: + raise HTTPException(status_code=400, detail="sqlIds is not in query_body") + else: + sql_ids = query_body['sqlIds'] + + if 'sqlExamplars' not in query_body: + raise HTTPException(status_code=400, + detail="sqlExamplars is not in query_body") + else: + sql_examplars = query_body['sqlExamplars'] + + text2sql_agent_router.sql_agent.update_examples(sql_example_ids=sql_ids, sql_example_units=sql_examplars) + text2sql_agent_router.sql_agent_cs.update_examples(sql_example_ids=sql_ids, sql_example_units=sql_examplars) + + return "success" + + +@router.post("/query2sql_delete_examples") +def query2sql_delete_examples(query_body: Mapping[str, Any]): + if 'sqlIds' not in query_body: + raise HTTPException(status_code=400, detail="sqlIds is not in query_body") + else: + sql_ids = query_body['sqlIds'] + + text2sql_agent_router.sql_agent.delete_examples(sql_example_ids=sql_ids) + text2sql_agent_router.sql_agent_cs.delete_examples(sql_example_ids=sql_ids) + + return "success" + + +@router.get("/query2sql_count_examples") +def query2sql_count_examples(): + sql_agent_examples_cnt = text2sql_agent_router.sql_agent.count_examples() + sql_agent_cs_examples_cnt = text2sql_agent_router.sql_agent_cs.count_examples() + + assert sql_agent_examples_cnt == sql_agent_cs_examples_cnt + + return sql_agent_examples_cnt +