(chore)(style) Apply Python styling by black via spotless maven plugin (#113)

* introduce spotless maven plugin with Python black for styling
This commit is contained in:
Bowen Liang
2023-09-25 10:57:14 +08:00
committed by GitHub
parent ec151d7b53
commit dbd259adb0
17 changed files with 838 additions and 537 deletions

View File

@@ -1,348 +1,371 @@
examplars= [ examplars = [
{ "current_date":"2020-12-01", {
"table_name":"内容库产品", "current_date": "2020-12-01",
"fields_list":"""["部门", "模块", "用户名", "访问次数", "访问人数", "访问时长", "数据日期"]""", "table_name": "内容库产品",
"question":"比较jackjchen和robinlee在内容库的访问次数", "fields_list": """["部门", "模块", "用户名", "访问次数", "访问人数", "访问时长", "数据日期"]""",
"prior_schema_links":"""['jackjchen'->用户名, 'robinlee'->用户名]""", "question": "比较jackjchen和robinlee在内容库的访问次数",
"prior_schema_links": """['jackjchen'->用户名, 'robinlee'->用户名]""",
"analysis": """让我们一步一步地思考。在问题“比较jackjchen和robinlee在内容库的访问次数“中我们被问 "analysis": """让我们一步一步地思考。在问题“比较jackjchen和robinlee在内容库的访问次数“中我们被问
“比较jackjchen和robinlee”所以我们需要column=[用户名] “比较jackjchen和robinlee”所以我们需要column=[用户名]
”内容库的访问次数“所以我们需要column=[访问次数] ”内容库的访问次数“所以我们需要column=[访问次数]
基于table和columns可能的cell values 是 = ['jackjchen', 'robinlee']。""", 基于table和columns可能的cell values 是 = ['jackjchen', 'robinlee']。""",
"schema_links":"""["用户名", "访问次数", "'jackjchen'", "'robinlee'"]""", "schema_links": """["用户名", "访问次数", "'jackjchen'", "'robinlee'"]""",
"sql":"""select 用户名, 访问次数 from 内容库产品 where 用户名 in ('jackjchen', 'robinlee') and 数据日期 = '2020-12-01' """ "sql": """select 用户名, 访问次数 from 内容库产品 where 用户名 in ('jackjchen', 'robinlee') and 数据日期 = '2020-12-01' """,
}, },
{ "current_date":"2022-11-06", {
"table_name":"内容库产品", "current_date": "2022-11-06",
"fields_list":"""["部门", "模块", "用户名", "访问次数", "访问人数", "访问时长", "数据日期"]""", "table_name": "内容库产品",
"question":"内容库近12个月访问人数 按部门", "fields_list": """["部门", "模块", "用户名", "访问次数", "访问人数", "访问时长", "数据日期"]""",
"prior_schema_links":"""[]""", "question": "内容库近12个月访问人数 按部门",
"prior_schema_links": """[]""",
"analysis": """让我们一步一步地思考。在问题“内容库近12个月访问人数 按部门“中,我们被问: "analysis": """让我们一步一步地思考。在问题“内容库近12个月访问人数 按部门“中,我们被问:
”内容库近12个月“所以我们需要column=[数据日期] ”内容库近12个月“所以我们需要column=[数据日期]
“访问人数”所以我们需要column=[访问人数] “访问人数”所以我们需要column=[访问人数]
”按部门“所以我们需要column=[部门] ”按部门“所以我们需要column=[部门]
基于table和columns可能的cell values 是 = [12]。""", 基于table和columns可能的cell values 是 = [12]。""",
"schema_links":"""["访问人数", "部门", "数据日期", 12]""", "schema_links": """["访问人数", "部门", "数据日期", 12]""",
"sql":"""select 部门, 数据日期, 访问人数 from 内容库产品 where datediff('month', 数据日期, '2022-11-06') <= 12 """ "sql": """select 部门, 数据日期, 访问人数 from 内容库产品 where datediff('month', 数据日期, '2022-11-06') <= 12 """,
}, },
{ "current_date":"2023-04-21", {
"table_name":"内容库产品", "current_date": "2023-04-21",
"fields_list":"""["部门", "模块", "用户名", "访问次数", "访问人数", "访问时长", "数据日期"]""", "table_name": "内容库产品",
"question":"内容库美术部、技术研发部的访问时长", "fields_list": """["部门", "模块", "用户名", "访问次数", "访问人数", "访问时长", "数据日期"]""",
"prior_schema_links":"""['美术部'->部门, '技术研发部'->部门]""", "question": "内容库美术部、技术研发部的访问时长",
"prior_schema_links": """['美术部'->部门, '技术研发部'->部门]""",
"analysis": """让我们一步一步地思考。在问题“内容库美术部、技术研发部的访问时长“中,我们被问: "analysis": """让我们一步一步地思考。在问题“内容库美术部、技术研发部的访问时长“中,我们被问:
“访问时长”所以我们需要column=[访问时长] “访问时长”所以我们需要column=[访问时长]
”内容库美术部、技术研发部“所以我们需要column=[部门] ”内容库美术部、技术研发部“所以我们需要column=[部门]
基于table和columns可能的cell values 是 = ['美术部', '技术研发部']。""", 基于table和columns可能的cell values 是 = ['美术部', '技术研发部']。""",
"schema_links":"""["访问时长", "部门", "'美术部'", "'技术研发部'"]""", "schema_links": """["访问时长", "部门", "'美术部'", "'技术研发部'"]""",
"sql":"""select 部门, 访问时长 from 内容库产品 where 部门 in ('美术部', '技术研发部') and 数据日期 = '2023-04-21' """ "sql": """select 部门, 访问时长 from 内容库产品 where 部门 in ('美术部', '技术研发部') and 数据日期 = '2023-04-21' """,
}, },
{ "current_date":"2023-08-21", {
"table_name":"严选", "current_date": "2023-08-21",
"fields_list":"""["严选版权归属系", "付费模式", "结算播放份额", "付费用户结算播放份额", "数据日期"]""", "table_name": "严选",
"question":"近3天海田飞系MPPM结算播放份额", "fields_list": """["严选版权归属系", "付费模式", "结算播放份额", "付费用户结算播放份额", "数据日期"]""",
"prior_schema_links":"""['海田飞系'->严选版权归属系]""", "question": "近3天海田飞系MPPM结算播放份额",
"prior_schema_links": """['海田飞系'->严选版权归属系]""",
"analysis": """让我们一步一步地思考。在问题“近3天海田飞系MPPM结算播放份额“中我们被问 "analysis": """让我们一步一步地思考。在问题“近3天海田飞系MPPM结算播放份额“中我们被问
“MPPM结算播放份额”所以我们需要column=[结算播放份额] “MPPM结算播放份额”所以我们需要column=[结算播放份额]
”海田飞系“所以我们需要column=[严选版权归属系] ”海田飞系“所以我们需要column=[严选版权归属系]
”近3天“所以我们需要column=[数据日期] ”近3天“所以我们需要column=[数据日期]
基于table和columns可能的cell values 是 = ['海田飞系', 3]。""", 基于table和columns可能的cell values 是 = ['海田飞系', 3]。""",
"schema_links":"""["结算播放份额", "严选版权归属系", "数据日期", "'海田飞系'", 3]""", "schema_links": """["结算播放份额", "严选版权归属系", "数据日期", "'海田飞系'", 3]""",
"sql":"""select 严选版权归属系, 结算播放份额 from 严选 where 严选版权归属系 = '海田飞系' and datediff('day', 数据日期, '2023-08-21') <= 3 """ "sql": """select 严选版权归属系, 结算播放份额 from 严选 where 严选版权归属系 = '海田飞系' and datediff('day', 数据日期, '2023-08-21') <= 3 """,
}, },
{ "current_date":"2023-05-22", {
"table_name":"歌曲库", "current_date": "2023-05-22",
"fields_list":"""["是否潮流人歌曲", "C音歌曲ID", "C音歌曲MID", "歌曲名", "歌曲版本", "语种", "歌曲类型", "翻唱类型", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "结算播放量", "运营播放量", "付费用户结算播放量", "历史累计结算播放量", "运营搜播量", "结算搜播量", "运营完播量", "运营推播量", "近7日复播率", "日均搜播量", "数据日期"]""", "table_name": "歌曲库",
"question":"对比近7天翻唱版和纯音乐的歌曲播放量", "fields_list": """["是否潮流人歌曲", "C音歌曲ID", "C音歌曲MID", "歌曲名", "歌曲版本", "语种", "歌曲类型", "翻唱类型", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "结算播放量", "运营播放量", "付费用户结算播放量", "历史累计结算播放量", "运营搜播量", "结算搜播量", "运营完播量", "运营推播量", "近7日复播率", "日均搜播量", "数据日期"]""",
"prior_schema_links":"""['纯音乐'->语种, '翻唱版'->歌曲版本]""", "question": "对比近7天翻唱版和纯音乐的歌曲播放量",
"prior_schema_links": """['纯音乐'->语种, '翻唱版'->歌曲版本]""",
"analysis": """让我们一步一步地思考。在问题“对比近3天翻唱版和纯音乐的歌曲播放量“中我们被问 "analysis": """让我们一步一步地思考。在问题“对比近3天翻唱版和纯音乐的歌曲播放量“中我们被问
“歌曲播放量”所以我们需要column=[结算播放量] “歌曲播放量”所以我们需要column=[结算播放量]
”翻唱版“所以我们需要column=[歌曲版本] ”翻唱版“所以我们需要column=[歌曲版本]
”和纯音乐的歌曲“所以我们需要column=[语种] ”和纯音乐的歌曲“所以我们需要column=[语种]
”近7天“所以我们需要column=[数据日期] ”近7天“所以我们需要column=[数据日期]
基于table和columns可能的cell values 是 = ['翻唱版', '纯音乐', 7]。""", 基于table和columns可能的cell values 是 = ['翻唱版', '纯音乐', 7]。""",
"schema_links":"""["结算播放量", "歌曲版本", "语种", "数据日期", "'翻唱版'", "'纯音乐'", 7]""", "schema_links": """["结算播放量", "歌曲版本", "语种", "数据日期", "'翻唱版'", "'纯音乐'", 7]""",
"sql":"""select 歌曲版本, 语种, 结算播放量 from 歌曲库 where 歌曲版本 = '翻唱版' and 语种 = '纯音乐' and datediff('day', 数据日期, '2023-05-22') <= 7 """ "sql": """select 歌曲版本, 语种, 结算播放量 from 歌曲库 where 歌曲版本 = '翻唱版' and 语种 = '纯音乐' and datediff('day', 数据日期, '2023-05-22') <= 7 """,
}, },
{ "current_date":"2023-05-31", {
"table_name":"艺人库", "current_date": "2023-05-31",
"fields_list":"""["上下架状态", "歌手名", "歌手等级", "歌手类型", "歌手来源", "MPPM潮流人等级", "活跃区域", "年龄", "歌手才能", "歌手风格", "粉丝数", "潮音粉丝数", "超声波粉丝数", "推博粉丝数", "超声波歌曲数", "在架歌曲数", "超声波分享数", "独占歌曲数", "超声波在架歌曲评论数", "有播放量歌曲数", "数据日期"]""", "table_name": "艺人库",
"question":"对比一下陈拙悬、孟梅琦、赖媚韵的粉丝数", "fields_list": """["上下架状态", "歌手名", "歌手等级", "歌手类型", "歌手来源", "MPPM潮流人等级", "活跃区域", "年龄", "歌手才能", "歌手风格", "粉丝数", "潮音粉丝数", "超声波粉丝数", "推博粉丝数", "超声波歌曲数", "在架歌曲数", "超声波分享数", "独占歌曲数", "超声波在架歌曲评论数", "有播放量歌曲数", "数据日期"]""",
"prior_schema_links":"""['1527896'->MPPM歌手ID, '1565463'->MPPM歌手ID, '2141459'->MPPM歌手ID]""", "question": "对比一下陈拙悬、孟梅琦、赖媚韵的粉丝数",
"prior_schema_links": """['1527896'->MPPM歌手ID, '1565463'->MPPM歌手ID, '2141459'->MPPM歌手ID]""",
"analysis": """让我们一步一步地思考。在问题“对比一下陈拙悬、孟梅琦、赖媚韵的粉丝数“中,我们被问: "analysis": """让我们一步一步地思考。在问题“对比一下陈拙悬、孟梅琦、赖媚韵的粉丝数“中,我们被问:
“粉丝数”所以我们需要column=[粉丝数] “粉丝数”所以我们需要column=[粉丝数]
”陈拙悬、孟梅琦、赖媚韵“所以我们需要column=[歌手名] ”陈拙悬、孟梅琦、赖媚韵“所以我们需要column=[歌手名]
基于table和columns可能的cell values 是 = ['陈拙悬', '孟梅琦', '赖媚韵']。""", 基于table和columns可能的cell values 是 = ['陈拙悬', '孟梅琦', '赖媚韵']。""",
"schema_links":"""["粉丝数", "歌手名", "'陈拙悬'", "'孟梅琦'", "'赖媚韵'"]""", "schema_links": """["粉丝数", "歌手名", "'陈拙悬'", "'孟梅琦'", "'赖媚韵'"]""",
"sql":"""select 歌手名, 粉丝数 from 艺人库 where 歌手名 in ('陈拙悬', '孟梅琦', '赖媚韵') and 数据日期 = '2023-05-31' """ "sql": """select 歌手名, 粉丝数 from 艺人库 where 歌手名 in ('陈拙悬', '孟梅琦', '赖媚韵') and 数据日期 = '2023-05-31' """,
}, },
{ "current_date":"2023-07-31", {
"table_name":"歌曲库", "current_date": "2023-07-31",
"fields_list":"""["歌曲名", "歌曲版本", "歌曲类型", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""", "table_name": "歌曲库",
"question":"播放量大于1万的歌曲有多少", "fields_list": """["歌曲名", "歌曲版本", "歌曲类型", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""",
"prior_schema_links":"""[]""", "question": "播放量大于1万的歌曲有多少",
"prior_schema_links": """[]""",
"analysis": """让我们一步一步地思考。在问题“播放量大于1万的歌曲有多少“中我们被问 "analysis": """让我们一步一步地思考。在问题“播放量大于1万的歌曲有多少“中我们被问
“歌曲有多少”所以我们需要column=[歌曲名] “歌曲有多少”所以我们需要column=[歌曲名]
”播放量大于1万的“所以我们需要column=[结算播放量] ”播放量大于1万的“所以我们需要column=[结算播放量]
基于table和columns可能的cell values 是 = [10000]。""", 基于table和columns可能的cell values 是 = [10000]。""",
"schema_links":"""["歌曲名", "结算播放量", 10000]""", "schema_links": """["歌曲名", "结算播放量", 10000]""",
"sql":"""select 歌曲名 from 歌曲库 where 结算播放量 > 10000 and 数据日期 = '2023-07-31' """ "sql": """select 歌曲名 from 歌曲库 where 结算播放量 > 10000 and 数据日期 = '2023-07-31' """,
}, },
{ "current_date":"2023-07-31", {
"table_name":"内容库产品", "current_date": "2023-07-31",
"fields_list":"""["用户名", "部门", "模块", "访问时长", "访问次数", "访问人数", "数据日期"]""", "table_name": "内容库产品",
"question":"内容库访问时长小于1小时且来自美术部的用户是哪些", "fields_list": """["用户名", "部门", "模块", "访问时长", "访问次数", "访问人数", "数据日期"]""",
"prior_schema_links":"""['美术部'->部门]""", "question": "内容库访问时长小于1小时且来自美术部的用户是哪些",
"prior_schema_links": """['美术部'->部门]""",
"analysis": """让我们一步一步地思考。在问题“内容库访问时长小于1小时且来自美术部的用户是哪些“中我们被问 "analysis": """让我们一步一步地思考。在问题“内容库访问时长小于1小时且来自美术部的用户是哪些“中我们被问
“用户是哪些”所以我们需要column=[用户名] “用户是哪些”所以我们需要column=[用户名]
”美术部的“所以我们需要column=[部门] ”美术部的“所以我们需要column=[部门]
”访问时长小于1小时“所以我们需要column=[访问时长] ”访问时长小于1小时“所以我们需要column=[访问时长]
基于table和columns可能的cell values 是 = ['美术部', 1]。""", 基于table和columns可能的cell values 是 = ['美术部', 1]。""",
"schema_links":"""["用户名", "部门", "访问时长", "'美术部'", 1]""", "schema_links": """["用户名", "部门", "访问时长", "'美术部'", 1]""",
"sql":"""select 用户名 from 内容库产品 where 部门 = '美术部' and 访问时长 < 1 and 数据日期 = '2023-07-31' """ "sql": """select 用户名 from 内容库产品 where 部门 = '美术部' and 访问时长 < 1 and 数据日期 = '2023-07-31' """,
}, },
{ "current_date":"2023-08-31", {
"table_name":"内容库产品", "current_date": "2023-08-31",
"fields_list":"""["用户名", "部门", "模块", "访问时长", "访问次数", "访问人数", "数据日期"]""", "table_name": "内容库产品",
"question":"内容库pv最高的用户有哪些", "fields_list": """["用户名", "部门", "模块", "访问时长", "访问次数", "访问人数", "数据日期"]""",
"prior_schema_links":"""[]""", "question": "内容库pv最高的用户有哪些",
"prior_schema_links": """[]""",
"analysis": """让我们一步一步地思考。在问题“内容库pv最高的用户有哪些“中我们被问 "analysis": """让我们一步一步地思考。在问题“内容库pv最高的用户有哪些“中我们被问
“用户有哪些”所以我们需要column=[用户名] “用户有哪些”所以我们需要column=[用户名]
”pv最高的“所以我们需要column=[访问次数] ”pv最高的“所以我们需要column=[访问次数]
基于table和columns可能的cell values 是 = []。""", 基于table和columns可能的cell values 是 = []。""",
"schema_links":"""["用户名", "访问次数"]""", "schema_links": """["用户名", "访问次数"]""",
"sql":"""select 用户名 from 内容库产品 where 数据日期 = '2023-08-31' order by 访问次数 desc limit 10 """ "sql": """select 用户名 from 内容库产品 where 数据日期 = '2023-08-31' order by 访问次数 desc limit 10 """,
}, },
{ "current_date":"2023-08-31", {
"table_name":"艺人库", "current_date": "2023-08-31",
"fields_list":"""["播放量层级", "播放量单调性", "播放量方差", "播放量突增类型", "播放量集中度", "歌手名", "歌手等级", "歌手类型", "歌手来源", "MPPM潮流人等级", "结算播放量", "运营播放量", "历史累计结算播放量", "有播放量歌曲数", "历史累计运营播放量", "付费用户结算播放量", "结算播放量占比", "运营播放份额", "免费用户结算播放占比", "完播量", "数据日期"]""", "table_name": "艺人库",
"question":"近90天袁亚伟播放量平均值是多少", "fields_list": """["播放量层级", "播放量单调性", "播放量方差", "播放量突增类型", "播放量集中度", "歌手名", "歌手等级", "歌手类型", "歌手来源", "MPPM潮流人等级", "结算播放量", "运营播放量", "历史累计结算播放量", "有播放量歌曲数", "历史累计运营播放量", "付费用户结算播放量", "结算播放量占比", "运营播放份额", "免费用户结算播放占比", "完播量", "数据日期"]""",
"prior_schema_links":"""['152789226'->MPPM歌手ID]""", "question": "近90天袁亚伟播放量平均值是多少",
"prior_schema_links": """['152789226'->MPPM歌手ID]""",
"analysis": """让我们一步一步地思考。在问题“近90天袁亚伟播放量平均值是多少“中我们被问 "analysis": """让我们一步一步地思考。在问题“近90天袁亚伟播放量平均值是多少“中我们被问
“播放量平均值是多少”所以我们需要column=[结算播放量] “播放量平均值是多少”所以我们需要column=[结算播放量]
”袁亚伟“所以我们需要column=[歌手名] ”袁亚伟“所以我们需要column=[歌手名]
”近90天“所以我们需要column=[数据日期] ”近90天“所以我们需要column=[数据日期]
基于table和columns可能的cell values 是 = ['袁亚伟', 90]。""", 基于table和columns可能的cell values 是 = ['袁亚伟', 90]。""",
"schema_links":"""["结算播放量", "歌手名", "数据日期", "'袁亚伟'", 90]""", "schema_links": """["结算播放量", "歌手名", "数据日期", "'袁亚伟'", 90]""",
"sql":"""select avg(结算播放量) from 艺人库 where 歌手名 = '袁亚伟' and datediff('day', 数据日期, '2023-08-31') <= 90 """ "sql": """select avg(结算播放量) from 艺人库 where 歌手名 = '袁亚伟' and datediff('day', 数据日期, '2023-08-31') <= 90 """,
}, },
{ "current_date":"2023-08-31", {
"table_name":"艺人库", "current_date": "2023-08-31",
"fields_list":"""["播放量层级", "播放量单调性", "播放量方差", "播放量突增类型", "播放量集中度", "歌手名", "歌手等级", "歌手类型", "歌手来源", "MPPM潮流人等级", "结算播放量", "运营播放量", "历史累计结算播放量", "有播放量歌曲数", "历史累计运营播放量", "付费用户结算播放量", "结算播放量占比", "运营播放份额", "免费用户结算播放占比", "完播量", "数据日期"]""", "table_name": "艺人库",
"question":"周倩倩近7天结算播放量总和是多少", "fields_list": """["播放量层级", "播放量单调性", "播放量方差", "播放量突增类型", "播放量集中度", "歌手名", "歌手等级", "歌手类型", "歌手来源", "MPPM潮流人等级", "结算播放量", "运营播放量", "历史累计结算播放量", "有播放量歌曲数", "历史累计运营播放量", "付费用户结算播放量", "结算播放量占比", "运营播放份额", "免费用户结算播放占比", "完播量", "数据日期"]""",
"prior_schema_links":"""['199509'->MPPM歌手ID]""", "question": "周倩倩近7天结算播放量总和是多少",
"prior_schema_links": """['199509'->MPPM歌手ID]""",
"analysis": """让我们一步一步地思考。在问题“周倩倩近7天结算播放量总和是多少“中我们被问 "analysis": """让我们一步一步地思考。在问题“周倩倩近7天结算播放量总和是多少“中我们被问
“结算播放量总和是多少”所以我们需要column=[结算播放量] “结算播放量总和是多少”所以我们需要column=[结算播放量]
”周倩倩“所以我们需要column=[歌手名] ”周倩倩“所以我们需要column=[歌手名]
”近7天“所以我们需要column=[数据日期] ”近7天“所以我们需要column=[数据日期]
基于table和columns可能的cell values 是 = ['周倩倩', 7]。""", 基于table和columns可能的cell values 是 = ['周倩倩', 7]。""",
"schema_links":"""["结算播放量", "歌手名", "数据日期", "'周倩倩'", 7]""", "schema_links": """["结算播放量", "歌手名", "数据日期", "'周倩倩'", 7]""",
"sql":"""select sum(结算播放量) from 艺人库 where 歌手名 = '周倩倩' and datediff('day', 数据日期, '2023-08-31') <= 7 """ "sql": """select sum(结算播放量) from 艺人库 where 歌手名 = '周倩倩' and datediff('day', 数据日期, '2023-08-31') <= 7 """,
}, },
{ "current_date":"2023-09-14", {
"table_name":"内容库产品", "current_date": "2023-09-14",
"fields_list":"""["部门", "模块", "用户名", "访问次数", "访问人数", "访问时长", "数据日期"]""", "table_name": "内容库产品",
"question":"内容库访问次数大于1k的部门是哪些", "fields_list": """["部门", "模块", "用户名", "访问次数", "访问人数", "访问时长", "数据日期"]""",
"prior_schema_links":"""[]""", "question": "内容库访问次数大于1k的部门是哪些",
"prior_schema_links": """[]""",
"analysis": """让我们一步一步地思考。在问题“内容库访问次数大于1k的部门是哪些“中我们被问 "analysis": """让我们一步一步地思考。在问题“内容库访问次数大于1k的部门是哪些“中我们被问
“部门是哪些”所以我们需要column=[部门] “部门是哪些”所以我们需要column=[部门]
”访问次数大于1k的“所以我们需要column=[访问次数] ”访问次数大于1k的“所以我们需要column=[访问次数]
基于table和columns可能的cell values 是 = [1000]。""", 基于table和columns可能的cell values 是 = [1000]。""",
"schema_links":"""["部门", "访问次数", 1000]""", "schema_links": """["部门", "访问次数", 1000]""",
"sql":"""select 部门 from 内容库产品 where 访问次数 > 1000 and 数据日期 = '2023-09-14' """ "sql": """select 部门 from 内容库产品 where 访问次数 > 1000 and 数据日期 = '2023-09-14' """,
}, },
{ "current_date":"2023-09-18", {
"table_name":"歌曲库", "current_date": "2023-09-18",
"fields_list":"""["歌曲名", "MPPM歌手ID", "歌曲版本", "歌曲类型", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""", "table_name": "歌曲库",
"question":"陈亿训唱的所有的播放量大于20k的孤勇者有哪些", "fields_list": """["歌曲名", "MPPM歌手ID", "歌曲版本", "歌曲类型", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""",
"prior_schema_links":"""['199509'->MPPM歌手ID, '1527123'->MPPM歌曲ID]""", "question": "陈亿训唱的所有的播放量大于20k的孤勇者有哪些",
"prior_schema_links": """['199509'->MPPM歌手ID, '1527123'->MPPM歌曲ID]""",
"analysis": """让我们一步一步地思考。在问题“陈亿训唱的所有的播放量大于20k的孤勇者有哪些“中我们被问 "analysis": """让我们一步一步地思考。在问题“陈亿训唱的所有的播放量大于20k的孤勇者有哪些“中我们被问
“孤勇者有哪些”所以我们需要column=[歌曲名] “孤勇者有哪些”所以我们需要column=[歌曲名]
”播放量大于20k的“所以我们需要column=[结算播放量] ”播放量大于20k的“所以我们需要column=[结算播放量]
”陈亿训唱的“所以我们需要column=[歌手名] ”陈亿训唱的“所以我们需要column=[歌手名]
基于table和columns可能的cell values 是 = [20000, '陈亿训', '孤勇者']。""", 基于table和columns可能的cell values 是 = [20000, '陈亿训', '孤勇者']。""",
"schema_links":"""["歌曲名", "结算播放量", "歌手名", 20000, "'陈亿训'", "'孤勇者'"]""", "schema_links": """["歌曲名", "结算播放量", "歌手名", 20000, "'陈亿训'", "'孤勇者'"]""",
"sql":"""select 歌曲名 from 歌曲库 where 结算播放量 > 20000 and 歌手名 = '陈亿训' and 歌曲名 = '孤勇者' and 数据日期 = '2023-09-18' """ "sql": """select 歌曲名 from 歌曲库 where 结算播放量 > 20000 and 歌手名 = '陈亿训' and 歌曲名 = '孤勇者' and 数据日期 = '2023-09-18' """,
}, },
{ "current_date":"2023-09-18", {
"table_name":"歌曲库", "current_date": "2023-09-18",
"fields_list":"""["歌曲名", "歌曲版本", "歌手名", "歌曲类型", "发布时间", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""", "table_name": "歌曲库",
"question":"周洁轮去年发布的歌曲有哪些", "fields_list": """["歌曲名", "歌曲版本", "歌手名", "歌曲类型", "发布时间", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""",
"prior_schema_links":"""['23109'->MPPM歌手ID]""", "question": "周洁轮去年发布的歌曲有哪些",
"prior_schema_links": """['23109'->MPPM歌手ID]""",
"analysis": """让我们一步一步地思考。在问题“周洁轮去年发布的歌曲有哪些“中,我们被问: "analysis": """让我们一步一步地思考。在问题“周洁轮去年发布的歌曲有哪些“中,我们被问:
“歌曲有哪些”所以我们需要column=[歌曲名] “歌曲有哪些”所以我们需要column=[歌曲名]
”去年发布的“所以我们需要column=[发布时间] ”去年发布的“所以我们需要column=[发布时间]
”周洁轮“所以我们需要column=[歌手名] ”周洁轮“所以我们需要column=[歌手名]
基于table和columns可能的cell values 是 = ['周洁轮', 1]。""", 基于table和columns可能的cell values 是 = ['周洁轮', 1]。""",
"schema_links":"""["歌曲名", "发布时间", "歌手名", 1, "'周洁轮'"]""", "schema_links": """["歌曲名", "发布时间", "歌手名", 1, "'周洁轮'"]""",
"sql":"""select 歌曲名 from 歌曲库 where datediff('year', 发布时间, '2023-09-18') <= 1 and 歌手名 = '周洁轮' and 数据日期 = '2023-09-18' """ "sql": """select 歌曲名 from 歌曲库 where datediff('year', 发布时间, '2023-09-18') <= 1 and 歌手名 = '周洁轮' and 数据日期 = '2023-09-18' """,
}, },
{ "current_date":"2023-09-11", {
"table_name":"艺人库", "current_date": "2023-09-11",
"fields_list":"""["播放量层级", "播放量单调性", "播放量方差", "播放量突增类型", "播放量集中度", "歌手名", "歌手等级", "歌手类型", "歌手来源", "签约日期", "MPPM潮流人等级", "结算播放量", "运营播放量", "历史累计结算播放量", "有播放量歌曲数", "历史累计运营播放量", "付费用户结算播放量", "结算播放量占比", "运营播放份额", "免费用户结算播放占比", "完播量", "数据日期"]""", "table_name": "艺人库",
"question":"我想要近半年签约的播放量前十的歌手有哪些", "fields_list": """["播放量层级", "播放量单调性", "播放量方差", "播放量突增类型", "播放量集中度", "歌手名", "歌手等级", "歌手类型", "歌手来源", "签约日期", "MPPM潮流人等级", "结算播放量", "运营播放量", "历史累计结算播放量", "有播放量歌曲数", "历史累计运营播放量", "付费用户结算播放量", "结算播放量占比", "运营播放份额", "免费用户结算播放占比", "完播量", "数据日期"]""",
"prior_schema_links":"""[]""", "question": "我想要近半年签约的播放量前十的歌手有哪些",
"prior_schema_links": """[]""",
"analysis": """让我们一步一步地思考。在问题“我想要近半年签约的播放量前十的歌手“中,我们被问: "analysis": """让我们一步一步地思考。在问题“我想要近半年签约的播放量前十的歌手“中,我们被问:
“歌手有哪些”所以我们需要column=[歌手名] “歌手有哪些”所以我们需要column=[歌手名]
”播放量前十的“所以我们需要column=[结算播放量] ”播放量前十的“所以我们需要column=[结算播放量]
”近半年签约的“所以我们需要column=[签约日期] ”近半年签约的“所以我们需要column=[签约日期]
基于table和columns可能的cell values 是 = [0.5, 10]。""", 基于table和columns可能的cell values 是 = [0.5, 10]。""",
"schema_links":"""["歌手名", "结算播放量", "签约日期", 0.5, 10]""", "schema_links": """["歌手名", "结算播放量", "签约日期", 0.5, 10]""",
"sql":"""select 歌手名 from 艺人库 where datediff('year', 签约日期, '2023-09-11') <= 0.5 and 数据日期 = '2023-09-11' order by 结算播放量 desc limit 10""" "sql": """select 歌手名 from 艺人库 where datediff('year', 签约日期, '2023-09-11') <= 0.5 and 数据日期 = '2023-09-11' order by 结算播放量 desc limit 10""",
}, },
{ "current_date":"2023-08-12", {
"table_name":"歌曲库", "current_date": "2023-08-12",
"table_name": "歌曲库",
"fields_list": """["发行日期", "歌曲语言", "歌曲来源", "歌曲流派", "歌曲名", "歌曲版本", "歌曲类型", "发行时间", "数据日期"]""", "fields_list": """["发行日期", "歌曲语言", "歌曲来源", "歌曲流派", "歌曲名", "歌曲版本", "歌曲类型", "发行时间", "数据日期"]""",
"question":"最近一年发行的歌曲中有哪些在近7天播放超过一千万的", "question": "最近一年发行的歌曲中有哪些在近7天播放超过一千万的",
"prior_schema_links":"""[]""", "prior_schema_links": """[]""",
"analysis": """让我们一步一步地思考。在问题“最近一年发行的歌曲中有哪些在近7天播放超过一千万的“中我们被问 "analysis": """让我们一步一步地思考。在问题“最近一年发行的歌曲中有哪些在近7天播放超过一千万的“中我们被问
“发行的歌曲中有哪些”所以我们需要column=[歌曲名] “发行的歌曲中有哪些”所以我们需要column=[歌曲名]
”最近一年发行的“所以我们需要column=[发行日期] ”最近一年发行的“所以我们需要column=[发行日期]
”在近7天播放超过一千万的“所以我们需要column=[数据日期, 结算播放量] ”在近7天播放超过一千万的“所以我们需要column=[数据日期, 结算播放量]
基于table和columns可能的cell values 是 = [1, 10000000]""", 基于table和columns可能的cell values 是 = [1, 10000000]""",
"schema_links":"""["歌曲名", "发行日期", "数据日期", "结算播放量", 1, 10000000]""", "schema_links": """["歌曲名", "发行日期", "数据日期", "结算播放量", 1, 10000000]""",
"sql":"""select 歌曲名 from 歌曲库 where datediff('year', 发行日期, '2023-08-12') <= 1 and datediff('day', 数据日期, '2023-08-12') <= 7 and 结算播放量 > 10000000""" "sql": """select 歌曲名 from 歌曲库 where datediff('year', 发行日期, '2023-08-12') <= 1 and datediff('day', 数据日期, '2023-08-12') <= 7 and 结算播放量 > 10000000""",
}, },
{ "current_date":"2023-08-12", {
"table_name":"歌曲库", "current_date": "2023-08-12",
"table_name": "歌曲库",
"fields_list": """["发行日期", "歌曲语言", "歌曲来源", "歌曲流派", "歌曲名", "歌曲版本", "歌曲类型", "发行时间", "数据日期"]""", "fields_list": """["发行日期", "歌曲语言", "歌曲来源", "歌曲流派", "歌曲名", "歌曲版本", "歌曲类型", "发行时间", "数据日期"]""",
"question":"今年以来发行的歌曲中有哪些在近7天播放超过一千万的", "question": "今年以来发行的歌曲中有哪些在近7天播放超过一千万的",
"prior_schema_links":"""[]""", "prior_schema_links": """[]""",
"analysis": """让我们一步一步地思考。在问题“今年以来发行的歌曲中有哪些在近7天播放超过一千万的“中我们被问 "analysis": """让我们一步一步地思考。在问题“今年以来发行的歌曲中有哪些在近7天播放超过一千万的“中我们被问
“发行的歌曲中有哪些”所以我们需要column=[歌曲名] “发行的歌曲中有哪些”所以我们需要column=[歌曲名]
”今年以来发行的“所以我们需要column=[发行日期] ”今年以来发行的“所以我们需要column=[发行日期]
”在近7天播放超过一千万的“所以我们需要column=[数据日期, 结算播放量] ”在近7天播放超过一千万的“所以我们需要column=[数据日期, 结算播放量]
基于table和columns可能的cell values 是 = [0, 7, 10000000]""", 基于table和columns可能的cell values 是 = [0, 7, 10000000]""",
"schema_links":"""["歌曲名", "发行日期", "数据日期", "结算播放量", 0, 7, 10000000]""", "schema_links": """["歌曲名", "发行日期", "数据日期", "结算播放量", 0, 7, 10000000]""",
"sql":"""select 歌曲名 from 歌曲库 where datediff('year', 发行日期, '2023-08-12') <= 0 and datediff('day', 数据日期, '2023-08-12') <= 7 and 结算播放量 > 10000000""" "sql": """select 歌曲名 from 歌曲库 where datediff('year', 发行日期, '2023-08-12') <= 0 and datediff('day', 数据日期, '2023-08-12') <= 7 and 结算播放量 > 10000000""",
}, },
{ "current_date":"2023-08-12", {
"table_name":"歌曲库", "current_date": "2023-08-12",
"table_name": "歌曲库",
"fields_list": """["发行日期", "歌曲语言", "歌曲来源", "歌曲流派", "歌曲名", "歌曲版本", "歌曲类型", "发行时间", "数据日期"]""", "fields_list": """["发行日期", "歌曲语言", "歌曲来源", "歌曲流派", "歌曲名", "歌曲版本", "歌曲类型", "发行时间", "数据日期"]""",
"question":"2023年以来发行的歌曲中有哪些在近7天播放超过一千万的", "question": "2023年以来发行的歌曲中有哪些在近7天播放超过一千万的",
"prior_schema_links":"""['514129144'->MPPM歌曲ID]""", "prior_schema_links": """['514129144'->MPPM歌曲ID]""",
"analysis": """让我们一步一步地思考。在问题“2023年以来发行的歌曲中有哪些在近7天播放超过一千万的“中我们被问 "analysis": """让我们一步一步地思考。在问题“2023年以来发行的歌曲中有哪些在近7天播放超过一千万的“中我们被问
“发行的歌曲中有哪些”所以我们需要column=[歌曲名] “发行的歌曲中有哪些”所以我们需要column=[歌曲名]
”2023年以来发行的“所以我们需要column=[发行日期] ”2023年以来发行的“所以我们需要column=[发行日期]
”在近7天播放超过一千万的“所以我们需要column=[数据日期, 结算播放量] ”在近7天播放超过一千万的“所以我们需要column=[数据日期, 结算播放量]
基于table和columns可能的cell values 是 = [2023, 7, 10000000]""", 基于table和columns可能的cell values 是 = [2023, 7, 10000000]""",
"schema_links":"""["歌曲名", "发行日期", "数据日期", "结算播放量", 2023, 7, 10000000]""", "schema_links": """["歌曲名", "发行日期", "数据日期", "结算播放量", 2023, 7, 10000000]""",
"sql":"""select 歌曲名 from 歌曲库 where YEAR(发行日期) >= 2023 and datediff('day', 数据日期, '2023-08-12') <= 7 and 结算播放量 > 10000000""" "sql": """select 歌曲名 from 歌曲库 where YEAR(发行日期) >= 2023 and datediff('day', 数据日期, '2023-08-12') <= 7 and 结算播放量 > 10000000""",
}, },
{ "current_date":"2023-08-01", {
"table_name":"歌曲库", "current_date": "2023-08-01",
"fields_list":"""["歌曲名", "歌曲版本", "歌手名", "歌曲类型", "发布时间", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""", "table_name": "歌曲库",
"question":"周洁轮2023年6月之后发布的歌曲有哪些", "fields_list": """["歌曲名", "歌曲版本", "歌手名", "歌曲类型", "发布时间", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""",
"prior_schema_links":"""['23109'->MPPM歌手ID]""", "question": "周洁轮2023年6月之后发布的歌曲有哪些",
"prior_schema_links": """['23109'->MPPM歌手ID]""",
"analysis": """让我们一步一步地思考。在问题“周洁轮2023年6月之后发布的歌曲有哪些“中我们被问 "analysis": """让我们一步一步地思考。在问题“周洁轮2023年6月之后发布的歌曲有哪些“中我们被问
“歌曲有哪些”所以我们需要column=[歌曲名] “歌曲有哪些”所以我们需要column=[歌曲名]
”2023年6月之后发布的“所以我们需要column=[发布时间] ”2023年6月之后发布的“所以我们需要column=[发布时间]
”周洁轮“所以我们需要column=[歌手名] ”周洁轮“所以我们需要column=[歌手名]
基于table和columns可能的cell values 是 = ['周洁轮', 2023, 6]。""", 基于table和columns可能的cell values 是 = ['周洁轮', 2023, 6]。""",
"schema_links":"""["歌曲名", "发布时间", "歌手名", "周洁轮", 2023, 6]""", "schema_links": """["歌曲名", "发布时间", "歌手名", "周洁轮", 2023, 6]""",
"sql":"""select 歌曲名 from 歌曲库 where YEAR(发布时间) >= 2023 and MONTH(发布时间) >= 6 and 歌手名 = '周洁轮' and 数据日期 = '2023-08-01' """ "sql": """select 歌曲名 from 歌曲库 where YEAR(发布时间) >= 2023 and MONTH(发布时间) >= 6 and 歌手名 = '周洁轮' and 数据日期 = '2023-08-01' """,
}, },
{ "current_date":"2023-08-01", {
"table_name":"歌曲库", "current_date": "2023-08-01",
"fields_list":"""["歌曲名", "歌曲版本", "歌手名", "歌曲类型", "发布时间", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""", "table_name": "歌曲库",
"question":"邓梓琦在2023年1月5日之后发布的歌曲中有哪些播放量大于500W的", "fields_list": """["歌曲名", "歌曲版本", "歌手名", "歌曲类型", "发布时间", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""",
"prior_schema_links":"""['2312311'->MPPM歌手ID]""", "question": "邓梓琦在2023年1月5日之后发布的歌曲中有哪些播放量大于500W的",
"prior_schema_links": """['2312311'->MPPM歌手ID]""",
"analysis": """让我们一步一步地思考。在问题“邓梓琦在2023年1月5日之后发布的歌曲中有哪些播放量大于500W的“中我们被问 "analysis": """让我们一步一步地思考。在问题“邓梓琦在2023年1月5日之后发布的歌曲中有哪些播放量大于500W的“中我们被问
“播放量大于500W的”所以我们需要column=[结算播放量] “播放量大于500W的”所以我们需要column=[结算播放量]
”邓梓琦在2023年1月5日之后发布的“所以我们需要column=[发布时间] ”邓梓琦在2023年1月5日之后发布的“所以我们需要column=[发布时间]
”邓梓琦“所以我们需要column=[歌手名] ”邓梓琦“所以我们需要column=[歌手名]
基于table和columns可能的cell values 是 = ['邓梓琦', 2023, 1, 5, 5000000]。""", 基于table和columns可能的cell values 是 = ['邓梓琦', 2023, 1, 5, 5000000]。""",
"schema_links":"""["结算播放量", "发布时间", "歌手名", "邓梓琦", 2023, 1, 5, 5000000]""", "schema_links": """["结算播放量", "发布时间", "歌手名", "邓梓琦", 2023, 1, 5, 5000000]""",
"sql":"""select 歌曲名 from 歌曲库 where YEAR(发布时间) >= 2023 and MONTH(发布时间) >= 1 and DAY(发布时间) >= 5 and 歌手名 = '邓梓琦' and 结算播放量 > 5000000 and 数据日期 = '2023-08-01'""" "sql": """select 歌曲名 from 歌曲库 where YEAR(发布时间) >= 2023 and MONTH(发布时间) >= 1 and DAY(发布时间) >= 5 and 歌手名 = '邓梓琦' and 结算播放量 > 5000000 and 数据日期 = '2023-08-01'""",
}, },
{ "current_date":"2023-09-17", {
"table_name":"歌曲库", "current_date": "2023-09-17",
"fields_list":"""["歌曲名", "歌曲版本", "歌手名", "歌曲类型", "发布时间", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""", "table_name": "歌曲库",
"question":"2023年6月以后张亮英播放量大于200万的歌曲有哪些", "fields_list": """["歌曲名", "歌曲版本", "歌手名", "歌曲类型", "发布时间", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""",
"prior_schema_links":"""['45453'->MPPM歌手ID]""", "question": "2023年6月以后张亮英播放量大于200万的歌曲有哪些",
"prior_schema_links": """['45453'->MPPM歌手ID]""",
"analysis": """让我们一步一步地思考。在问题“2023年6月以后张亮英播放量大于200万的歌曲有哪些“中我们被问 "analysis": """让我们一步一步地思考。在问题“2023年6月以后张亮英播放量大于200万的歌曲有哪些“中我们被问
“播放量大于200万的”所以我们需要column=[结算播放量] “播放量大于200万的”所以我们需要column=[结算播放量]
”2023年6月以后张亮英“所以我们需要column=[数据日期, 歌手名] ”2023年6月以后张亮英“所以我们需要column=[数据日期, 歌手名]
”歌曲有哪些“所以我们需要column=[歌曲名] ”歌曲有哪些“所以我们需要column=[歌曲名]
基于table和columns可能的cell values 是 = ['张亮英', 2023, 6, 2000000]。""", 基于table和columns可能的cell values 是 = ['张亮英', 2023, 6, 2000000]。""",
"schema_links":"""["结算播放量", "数据日期", "歌手名", "张亮英", 2023, 6, 2000000]""", "schema_links": """["结算播放量", "数据日期", "歌手名", "张亮英", 2023, 6, 2000000]""",
"sql":"""select 歌曲名 from 歌曲库 where YEAR(数据日期) >= 2023 and MONTH(数据日期) >= 6 and 歌手名 = '张亮英' and 结算播放量 > 2000000 """ "sql": """select 歌曲名 from 歌曲库 where YEAR(数据日期) >= 2023 and MONTH(数据日期) >= 6 and 歌手名 = '张亮英' and 结算播放量 > 2000000 """,
}, },
{ "current_date":"2023-08-16", {
"table_name":"歌曲库", "current_date": "2023-08-16",
"fields_list":"""["歌曲名", "歌曲版本", "歌手名", "歌曲类型", "发布时间", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""", "table_name": "歌曲库",
"question":"2021年6月以后发布的李雨纯的播放量大于20万的歌曲有哪些", "fields_list": """["歌曲名", "歌曲版本", "歌手名", "歌曲类型", "发布时间", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""",
"prior_schema_links":"""['23109'->MPPM歌手ID]""", "question": "2021年6月以后发布的李雨纯的播放量大于20万的歌曲有哪些",
"prior_schema_links": """['23109'->MPPM歌手ID]""",
"analysis": """让我们一步一步地思考。在问题“2021年6月以后发布的李雨纯的播放量大于20万的歌曲有哪些“中我们被问 "analysis": """让我们一步一步地思考。在问题“2021年6月以后发布的李雨纯的播放量大于20万的歌曲有哪些“中我们被问
“播放量大于20万的”所以我们需要column=[结算播放量] “播放量大于20万的”所以我们需要column=[结算播放量]
”2021年6月以后发布的“所以我们需要column=[发布时间] ”2021年6月以后发布的“所以我们需要column=[发布时间]
”李雨纯“所以我们需要column=[歌手名] ”李雨纯“所以我们需要column=[歌手名]
基于table和columns可能的cell values 是 = ['李雨纯', 2021, 6, 200000]。""", 基于table和columns可能的cell values 是 = ['李雨纯', 2021, 6, 200000]。""",
"schema_links":"""["结算播放量", "发布时间", "歌手名", "李雨纯", 2021, 6, 200000]""", "schema_links": """["结算播放量", "发布时间", "歌手名", "李雨纯", 2021, 6, 200000]""",
"sql":"""select 歌曲名 from 歌曲库 where YEAR(发布时间) >= 2021 and MONTH(发布时间) >= 6 and 歌手名 = '李雨纯' and 结算播放量 > 200000 and 数据日期 = '2023-08-16'""" "sql": """select 歌曲名 from 歌曲库 where YEAR(发布时间) >= 2021 and MONTH(发布时间) >= 6 and 歌手名 = '李雨纯' and 结算播放量 > 200000 and 数据日期 = '2023-08-16'""",
}, },
{ "current_date":"2023-08-16", {
"table_name":"歌曲库", "current_date": "2023-08-16",
"fields_list":"""["歌曲名", "歌曲版本", "歌手名", "歌曲类型", "发布时间", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""", "table_name": "歌曲库",
"question":"刘锝桦在1992年4月2日到2020年5月2日之间发布的播放量大于20万的歌曲有哪些", "fields_list": """["歌曲名", "歌曲版本", "歌手名", "歌曲类型", "发布时间", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""",
"prior_schema_links":"""['4234234'->MPPM歌手ID]""", "question": "刘锝桦在1992年4月2日到2020年5月2日之间发布的播放量大于20万的歌曲有哪些",
"prior_schema_links": """['4234234'->MPPM歌手ID]""",
"analysis": """让我们一步一步地思考。在问题“刘锝桦在1992年4月2日到2020年5月2日之间发布的播放量大于20万的歌曲有哪些“中我们被问 "analysis": """让我们一步一步地思考。在问题“刘锝桦在1992年4月2日到2020年5月2日之间发布的播放量大于20万的歌曲有哪些“中我们被问
“播放量大于20万的”所以我们需要column=[结算播放量] “播放量大于20万的”所以我们需要column=[结算播放量]
”1992年4月2日到2020年5月2日之间发布的“所以我们需要column=[发布时间] ”1992年4月2日到2020年5月2日之间发布的“所以我们需要column=[发布时间]
”刘锝桦“所以我们需要column=[歌手名] ”刘锝桦“所以我们需要column=[歌手名]
基于table和columns可能的cell values 是 = ['刘锝桦', 1992, 4, 2, 2020, 5, 2, 200000]。""", 基于table和columns可能的cell values 是 = ['刘锝桦', 1992, 4, 2, 2020, 5, 2, 200000]。""",
"schema_links":"""["结算播放量", "发布时间", "歌手名", "刘锝桦", 1992, 4, 2, 2020, 5, 2, 200000]""", "schema_links": """["结算播放量", "发布时间", "歌手名", "刘锝桦", 1992, 4, 2, 2020, 5, 2, 200000]""",
"sql":"""select 歌曲名 from 歌曲库 where YEAR(发布时间) >= 1992 and MONTH(发布时间) >= 4 and DAY(发布时间) >= 2 and YEAR(发布时间) <= 2020 and MONTH(发布时间) <= 5 and DAY(发布时间) <= 2 and 歌手名 = '刘锝桦' and 结算播放量 > 200000 and 数据日期 = '2023-08-16'""" "sql": """select 歌曲名 from 歌曲库 where YEAR(发布时间) >= 1992 and MONTH(发布时间) >= 4 and DAY(发布时间) >= 2 and YEAR(发布时间) <= 2020 and MONTH(发布时间) <= 5 and DAY(发布时间) <= 2 and 歌手名 = '刘锝桦' and 结算播放量 > 200000 and 数据日期 = '2023-08-16'""",
}, },
{ {
"current_date":"2023-09-04", "current_date": "2023-09-04",
"table_name":"内容库产品", "table_name": "内容库产品",
"fields_list":"""["用户名", "部门", "模块", "访问时长", "访问次数", "访问人数", "数据日期"]""", "fields_list": """["用户名", "部门", "模块", "访问时长", "访问次数", "访问人数", "数据日期"]""",
"question":"内容库近30天访问次数的平均数", "question": "内容库近30天访问次数的平均数",
"prior_schema_links":"""[]""", "prior_schema_links": """[]""",
"analysis": """让我们一步一步地思考。在问题“内容库近30天访问次数的平均数“中我们被问 "analysis": """让我们一步一步地思考。在问题“内容库近30天访问次数的平均数“中我们被问
“访问次数的平均数”所以我们需要column=[访问次数] “访问次数的平均数”所以我们需要column=[访问次数]
”内容库近30天“所以我们需要column=[数据日期] ”内容库近30天“所以我们需要column=[数据日期]
基于table和columns可能的cell values 是 = [30]。""", 基于table和columns可能的cell values 是 = [30]。""",
"schema_links":"""["访问次数", "数据日期", 30]""", "schema_links": """["访问次数", "数据日期", 30]""",
"sql":"""select avg(访问次数) from 内容库产品 where datediff('day', 数据日期, '2023-09-04') <= 30 """ "sql": """select avg(访问次数) from 内容库产品 where datediff('day', 数据日期, '2023-09-04') <= 30 """,
}, },
{ {
"current_date":"2023-09-04", "current_date": "2023-09-04",
"table_name":"内容库产品", "table_name": "内容库产品",
"fields_list":"""["用户名", "部门", "模块", "访问时长", "访问次数", "访问人数", "数据日期"]""", "fields_list": """["用户名", "部门", "模块", "访问时长", "访问次数", "访问人数", "数据日期"]""",
"question":"内容库近半年哪个月的访问次数汇总最高", "question": "内容库近半年哪个月的访问次数汇总最高",
"prior_schema_links":"""[]""", "prior_schema_links": """[]""",
"analysis": """让我们一步一步地思考。在问题“内容库近半年哪个月的访问次数汇总最高“中,我们被问: "analysis": """让我们一步一步地思考。在问题“内容库近半年哪个月的访问次数汇总最高“中,我们被问:
“访问次数汇总最高”所以我们需要column=[访问次数] “访问次数汇总最高”所以我们需要column=[访问次数]
”内容库近半年“所以我们需要column=[数据日期] ”内容库近半年“所以我们需要column=[数据日期]
基于table和columns可能的cell values 是 = [0.5]。""", 基于table和columns可能的cell values 是 = [0.5]。""",
"schema_links":"""["访问次数", "数据日期", 0.5]""", "schema_links": """["访问次数", "数据日期", 0.5]""",
"sql":"""select MONTH(数据日期), sum(访问次数) from 内容库产品 where datediff('year', 数据日期, '2023-09-04') <= 0.5 group by MONTH(数据日期) order by sum(访问次数) desc limit 1 """ "sql": """select MONTH(数据日期), sum(访问次数) from 内容库产品 where datediff('year', 数据日期, '2023-09-04') <= 0.5 group by MONTH(数据日期) order by sum(访问次数) desc limit 1 """,
}, },
{ {
"current_date":"2023-09-04", "current_date": "2023-09-04",
"table_name":"内容库产品", "table_name": "内容库产品",
"fields_list":"""["用户名", "部门", "模块", "访问时长", "访问次数", "访问人数", "数据日期"]""", "fields_list": """["用户名", "部门", "模块", "访问时长", "访问次数", "访问人数", "数据日期"]""",
"question":"内容库近半年每个月的平均访问次数", "question": "内容库近半年每个月的平均访问次数",
"prior_schema_links":"""[]""", "prior_schema_links": """[]""",
"analysis": """让我们一步一步地思考。在问题“内容库近半年每个月的平均访问次数“中,我们被问: "analysis": """让我们一步一步地思考。在问题“内容库近半年每个月的平均访问次数“中,我们被问:
“每个月的平均访问次数”所以我们需要column=[访问次数] “每个月的平均访问次数”所以我们需要column=[访问次数]
”内容库近半年“所以我们需要column=[数据日期] ”内容库近半年“所以我们需要column=[数据日期]
基于table和columns可能的cell values 是 = [0.5]。""", 基于table和columns可能的cell values 是 = [0.5]。""",
"schema_links":"""["访问次数", "数据日期", 0.5]""", "schema_links": """["访问次数", "数据日期", 0.5]""",
"sql":"""select MONTH(数据日期), avg(访问次数) from 内容库产品 where datediff('year', 数据日期, '2023-09-04') <= 0.5 group by MONTH(数据日期) """ "sql": """select MONTH(数据日期), avg(访问次数) from 内容库产品 where datediff('year', 数据日期, '2023-09-04') <= 0.5 group by MONTH(数据日期) """,
}, },
{ {
"current_date":"2023-09-10", "current_date": "2023-09-10",
"table_name":"内容库产品", "table_name": "内容库产品",
"fields_list":"""["用户名", "部门", "模块", "访问时长", "访问次数", "访问人数", "数据日期"]""", "fields_list": """["用户名", "部门", "模块", "访问时长", "访问次数", "访问人数", "数据日期"]""",
"question":"内容库 按部门统计访问次数 top10 的部门", "question": "内容库 按部门统计访问次数 top10 的部门",
"prior_schema_links":"""[]""", "prior_schema_links": """[]""",
"analysis": """让我们一步一步地思考。在问题“内容库 按部门统计访问次数 top10 的部门“中,我们被问: "analysis": """让我们一步一步地思考。在问题“内容库 按部门统计访问次数 top10 的部门“中,我们被问:
“访问次数 top10 的部门”所以我们需要column=[访问次数] “访问次数 top10 的部门”所以我们需要column=[访问次数]
”内容库 按部门统计“所以我们需要column=[部门] ”内容库 按部门统计“所以我们需要column=[部门]
基于table和columns可能的cell values 是 = [10]。""", 基于table和columns可能的cell values 是 = [10]。""",
"schema_links":"""["访问次数", "部门", 10]""", "schema_links": """["访问次数", "部门", 10]""",
"sql":"""select 部门, sum(访问次数) from 内容库产品 group by 部门 order by sum(访问次数) desc limit 10 """ "sql": """select 部门, sum(访问次数) from 内容库产品 group by 部门 order by sum(访问次数) desc limit 10 """,
} },
] ]

View File

@@ -14,7 +14,7 @@ def construct_plugin_prompt(tool_config):
tool_name = tool_config["name"] tool_name = tool_config["name"]
tool_description = tool_config["description"] tool_description = tool_config["description"]
tool_examples = tool_config["examples"] tool_examples = tool_config["examples"]
prompt = "【工具名称】\n" + tool_name + "\n" prompt = "【工具名称】\n" + tool_name + "\n"
prompt += "【工具描述】\n" + tool_description + "\n" prompt += "【工具描述】\n" + tool_description + "\n"
@@ -23,6 +23,7 @@ def construct_plugin_prompt(tool_config):
prompt += example + "\n" prompt += example + "\n"
return prompt return prompt
def construct_plugin_pool_prompt(tool_config_list): def construct_plugin_pool_prompt(tool_config_list):
tool_explain_list = [] tool_explain_list = []
for tool_config in tool_config_list: for tool_config in tool_config_list:
@@ -35,15 +36,20 @@ def construct_plugin_pool_prompt(tool_config_list):
def construct_task_prompt(query_text, tool_explain_list_str): def construct_task_prompt(query_text, tool_explain_list_str):
instruction = """问题为:{query_text}\n请根据问题和工具的描述选择对应的工具完成任务。请注意只能选择1个工具。请一步一步地分析选择工具的原因(每个工具的【工具适用问题示例】是选择的重要参考依据)并给出最终选择输出格式为json,key为分析过程, ’选择工具‘""".format(query_text=query_text) instruction = """问题为:{query_text}\n请根据问题和工具的描述选择对应的工具完成任务。请注意只能选择1个工具。请一步一步地分析选择工具的原因(每个工具的【工具适用问题示例】是选择的重要参考依据)并给出最终选择输出格式为json,key为分析过程, ’选择工具‘""".format(
query_text=query_text
)
prompt = "工具选择如下:\n\n{tool_explain_list_str}\n\n【任务说明】\n{instruction}".format(
instruction=instruction, tool_explain_list_str=tool_explain_list_str
)
prompt = "工具选择如下:\n\n{tool_explain_list_str}\n\n【任务说明】\n{instruction}".format(instruction=instruction, tool_explain_list_str=tool_explain_list_str)
return prompt return prompt
def plugin_selection_output_parse(llm_output: str)-> Union[Mapping[str, str], None]:
def plugin_selection_output_parse(llm_output: str) -> Union[Mapping[str, str], None]:
try: try:
pattern = r'\{[^{}]+\}' pattern = r"\{[^{}]+\}"
find_result = re.findall(pattern, llm_output) find_result = re.findall(pattern, llm_output)
result = find_result[0].strip() result = find_result[0].strip()
@@ -52,20 +58,24 @@ def plugin_selection_output_parse(llm_output: str)-> Union[Mapping[str, str], No
result_dict = json.loads(result) result_dict = json.loads(result)
print("result_dict: ", result_dict) print("result_dict: ", result_dict)
key_mapping = { key_mapping = {"分析过程": "analysis", "选择工具": "toolSelection"}
"分析过程":"analysis",
"选择工具":"toolSelection"
}
converted_result_dict = {key_mapping[key]: value for key, value in result_dict.items() if key in key_mapping} converted_result_dict = {
key_mapping[key]: value
for key, value in result_dict.items()
if key in key_mapping
}
except Exception as e: except Exception as e:
print(e) print(e)
converted_result_dict = None converted_result_dict = None
return converted_result_dict return converted_result_dict
def plugins_config_format_convert(plugin_config_list: List[Mapping[str, Any]]) -> List[Mapping[str, Any]]:
def plugins_config_format_convert(
plugin_config_list: List[Mapping[str, Any]]
) -> List[Mapping[str, Any]]:
plugin_config_list_new = [] plugin_config_list_new = []
for plugin_config in plugin_config_list: for plugin_config in plugin_config_list:
plugin_config_new = dict() plugin_config_new = dict()
@@ -75,7 +85,9 @@ def plugins_config_format_convert(plugin_config_list: List[Mapping[str, Any]]) -
parameters = plugin_config["parameters"] parameters = plugin_config["parameters"]
examples_str = "\n".join(examples) examples_str = "\n".join(examples)
description_new = """{plugin_desc}\n\n例如能够处理如下问题:\n{examples_str}""".format(plugin_desc=description, examples_str=examples_str) description_new = """{plugin_desc}\n\n例如能够处理如下问题:\n{examples_str}""".format(
plugin_desc=description, examples_str=examples_str
)
plugin_config_new["name"] = name plugin_config_new["name"] = name
plugin_config_new["description"] = description_new plugin_config_new["description"] = description_new
@@ -84,4 +96,3 @@ def plugins_config_format_convert(plugin_config_list: List[Mapping[str, Any]]) -
plugin_config_list_new.append(plugin_config_new) plugin_config_list_new.append(plugin_config_new)
return plugin_config_list_new return plugin_config_list_new

View File

@@ -10,12 +10,19 @@ import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(os.path.dirname(os.path.abspath(__file__))) sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from plugin_call.prompt_construct import construct_plugin_pool_prompt, construct_task_prompt, plugin_selection_output_parse, plugins_config_format_convert from plugin_call.prompt_construct import (
construct_plugin_pool_prompt,
construct_task_prompt,
plugin_selection_output_parse,
plugins_config_format_convert,
)
from util.llm_instance import llm from util.llm_instance import llm
def plugin_selection_run(query_text: str, plugin_configs: List[Mapping[str, Any]])-> Union[Mapping[str, str], None]: def plugin_selection_run(
query_text: str, plugin_configs: List[Mapping[str, Any]]
) -> Union[Mapping[str, str], None]:
tools_prompt = construct_plugin_pool_prompt(plugin_configs) tools_prompt = construct_plugin_pool_prompt(plugin_configs)
task_prompt = construct_task_prompt(query_text, tools_prompt) task_prompt = construct_task_prompt(query_text, tools_prompt)
@@ -23,4 +30,3 @@ def plugin_selection_run(query_text: str, plugin_configs: List[Mapping[str, Any]
parsed_output = plugin_selection_output_parse(llm_output) parsed_output = plugin_selection_output_parse(llm_output)
return parsed_output return parsed_output

View File

@@ -11,7 +11,7 @@ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(os.path.dirname(os.path.abspath(__file__))) sys.path.append(os.path.dirname(os.path.abspath(__file__)))
def get_ids(documents:List[str]) -> List[str]: def get_ids(documents: List[str]) -> List[str]:
ids = [] ids = []
for doc in documents: for doc in documents:
ids.append(str(uuid.uuid5(uuid.NAMESPACE_URL, doc))) ids.append(str(uuid.uuid5(uuid.NAMESPACE_URL, doc)))
@@ -19,25 +19,23 @@ def get_ids(documents:List[str]) -> List[str]:
return ids return ids
def add2preset_query_collection(collection:Collection, def add2preset_query_collection(
preset_queries:List[str], collection: Collection, preset_queries: List[str], preset_query_ids: List[str]
preset_query_ids:List[str] ) -> None:
) -> None:
collection.add(documents=preset_queries, collection.add(documents=preset_queries, ids=preset_query_ids)
ids=preset_query_ids)
def update_preset_query_collection(collection:Collection, def update_preset_query_collection(
preset_queries:List[str], collection: Collection, preset_queries: List[str], preset_query_ids: List[str]
preset_query_ids:List[str] ) -> None:
) -> None:
collection.update(documents=preset_queries,
ids=preset_query_ids)
def query2preset_query_collection(collection:Collection, query_texts:List[str], n_results:int=10): collection.update(documents=preset_queries, ids=preset_query_ids)
def query2preset_query_collection(
collection: Collection, query_texts: List[str], n_results: int = 10
):
collection_cnt = collection.count() collection_cnt = collection.count()
min_n_results = 10 min_n_results = 10
min_n_results = min(collection_cnt, min_n_results) min_n_results = min(collection_cnt, min_n_results)
@@ -56,12 +54,13 @@ def query2preset_query_collection(collection:Collection, query_texts:List[str],
return res return res
def parse_retrieval_preset_query(res:List[Mapping[str, Any]]):
parsed_res = [[] for _ in range(0, len(res['ids']))]
retrieval_ids = res['ids'] def parse_retrieval_preset_query(res: List[Mapping[str, Any]]):
retrieval_distances = res['distances'] parsed_res = [[] for _ in range(0, len(res["ids"]))]
retrieval_sentences = res['documents']
retrieval_ids = res["ids"]
retrieval_distances = res["distances"]
retrieval_sentences = res["documents"]
for query_idx in range(0, len(retrieval_ids)): for query_idx in range(0, len(retrieval_ids)):
id_ls = retrieval_ids[query_idx] id_ls = retrieval_ids[query_idx]
@@ -73,43 +72,41 @@ def parse_retrieval_preset_query(res:List[Mapping[str, Any]]):
distance = distance_ls[idx] distance = distance_ls[idx]
sentence = sentence_ls[idx] sentence = sentence_ls[idx]
parsed_res[query_idx].append({ parsed_res[query_idx].append(
'id': id, {"id": id, "distance": distance, "presetQuery": sentence}
'distance': distance, )
'presetQuery': sentence
})
return parsed_res return parsed_res
def preset_query_retrieval_format(query_list:List[str], retrieval_list:List[Mapping[str, Any]]):
def preset_query_retrieval_format(
query_list: List[str], retrieval_list: List[Mapping[str, Any]]
):
res = [] res = []
for query_idx in range(0, len(query_list)): for query_idx in range(0, len(query_list)):
query = query_list[query_idx] query = query_list[query_idx]
retrieval = retrieval_list[query_idx] retrieval = retrieval_list[query_idx]
res.append({ res.append({"query": query, "retrieval": retrieval})
'query': query,
'retrieval': retrieval
})
return res return res
def empty_preset_query_collection(collection:Collection) -> None:
def empty_preset_query_collection(collection: Collection) -> None:
collection.delete() collection.delete()
def delete_preset_query_by_ids(collection:Collection, preset_query_ids:List[str]) -> None:
def delete_preset_query_by_ids(
collection: Collection, preset_query_ids: List[str]
) -> None:
collection.delete(ids=preset_query_ids) collection.delete(ids=preset_query_ids)
def get_preset_query_by_ids(collection:Collection, preset_query_ids:List[str]):
def get_preset_query_by_ids(collection: Collection, preset_query_ids: List[str]):
res = collection.get(ids=preset_query_ids) res = collection.get(ids=preset_query_ids)
return res return res
def preset_query_collection_size(collection:Collection) -> int:
def preset_query_collection_size(collection: Collection) -> int:
return collection.count() return collection.count()

View File

@@ -13,34 +13,45 @@ from chromadb.api import Collection, Documents, Embeddings
from langchain.llms import OpenAI from langchain.llms import OpenAI
from preset_query_db import (get_ids, add2preset_query_collection, from preset_query_db import (
query2preset_query_collection, parse_retrieval_preset_query, get_ids,
preset_query_retrieval_format, empty_preset_query_collection, preset_query_collection_size) add2preset_query_collection,
query2preset_query_collection,
parse_retrieval_preset_query,
preset_query_retrieval_format,
empty_preset_query_collection,
preset_query_collection_size,
)
from util.text2vec import Text2VecEmbeddingFunction from util.text2vec import Text2VecEmbeddingFunction
from run_config import CHROMA_DB_PERSIST_PATH, PRESET_QUERY_COLLECTION_NAME from run_config import CHROMA_DB_PERSIST_PATH, PRESET_QUERY_COLLECTION_NAME
from util.chromadb_instance import client from util.chromadb_instance import client
emb_func = Text2VecEmbeddingFunction() emb_func = Text2VecEmbeddingFunction()
collection = client.get_or_create_collection(name=PRESET_QUERY_COLLECTION_NAME, collection = client.get_or_create_collection(
embedding_function=emb_func, name=PRESET_QUERY_COLLECTION_NAME,
metadata={"hnsw:space": "cosine"} embedding_function=emb_func,
) # Get a collection object from an existing collection, by name. If it doesn't exist, create it. metadata={"hnsw:space": "cosine"},
) # Get a collection object from an existing collection, by name. If it doesn't exist, create it.
print("init_preset_query_collection_size: ", preset_query_collection_size(collection)) print("init_preset_query_collection_size: ", preset_query_collection_size(collection))
def preset_query_retrieval_run(collection:Collection, query_texts_list:List[str], n_results:int=5): def preset_query_retrieval_run(
retrieval_res = query2preset_query_collection(collection=collection, collection: Collection, query_texts_list: List[str], n_results: int = 5
query_texts=query_texts_list, ):
n_results=n_results) retrieval_res = query2preset_query_collection(
collection=collection, query_texts=query_texts_list, n_results=n_results
)
parsed_retrieval_res = parse_retrieval_preset_query(retrieval_res) parsed_retrieval_res = parse_retrieval_preset_query(retrieval_res)
parsed_retrieval_res_format = preset_query_retrieval_format(query_texts_list, parsed_retrieval_res) parsed_retrieval_res_format = preset_query_retrieval_format(
query_texts_list, parsed_retrieval_res
)
print('parsed_retrieval_res_format: ', parsed_retrieval_res_format) print("parsed_retrieval_res_format: ", parsed_retrieval_res_format)
return parsed_retrieval_res_format return parsed_retrieval_res_format

View File

@@ -11,7 +11,7 @@ OPENAI_API_KEY = "YOUR_API_KEY"
TEMPERATURE = 0.0 TEMPERATURE = 0.0
CHROMA_DB_PERSIST_DIR = 'chm_db' CHROMA_DB_PERSIST_DIR = "chm_db"
PRESET_QUERY_COLLECTION_NAME = "preset_query_collection" PRESET_QUERY_COLLECTION_NAME = "preset_query_collection"
TEXT2DSL_COLLECTION_NAME = "text2dsl_collection" TEXT2DSL_COLLECTION_NAME = "text2dsl_collection"
TEXT2DSL_FEW_SHOTS_EXAMPLE_NUM = 15 TEXT2DSL_FEW_SHOTS_EXAMPLE_NUM = 15
@@ -21,9 +21,9 @@ CHROMA_DB_PERSIST_PATH = os.path.join(PROJECT_DIR_PATH, CHROMA_DB_PERSIST_DIR)
HF_TEXT2VEC_MODEL_NAME = "GanymedeNil/text2vec-large-chinese" HF_TEXT2VEC_MODEL_NAME = "GanymedeNil/text2vec-large-chinese"
if __name__ == '__main__': if __name__ == "__main__":
print('PROJECT_DIR_PATH: ', PROJECT_DIR_PATH) print("PROJECT_DIR_PATH: ", PROJECT_DIR_PATH)
print('EMB_MODEL_PATH: ', HF_TEXT2VEC_MODEL_NAME) print("EMB_MODEL_PATH: ", HF_TEXT2VEC_MODEL_NAME)
print('CHROMA_DB_PERSIST_PATH: ', CHROMA_DB_PERSIST_PATH) print("CHROMA_DB_PERSIST_PATH: ", CHROMA_DB_PERSIST_PATH)
print('LLMPARSER_HOST: ', LLMPARSER_HOST) print("LLMPARSER_HOST: ", LLMPARSER_HOST)
print('LLMPARSER_PORT: ', LLMPARSER_PORT) print("LLMPARSER_PORT: ", LLMPARSER_PORT)

View File

@@ -22,20 +22,34 @@ from util.text2vec import Text2VecEmbeddingFunction, hg_embedding
from util.chromadb_instance import client as chromadb_client, empty_chroma_collection_2 from util.chromadb_instance import client as chromadb_client, empty_chroma_collection_2
from run_config import TEXT2DSL_COLLECTION_NAME, TEXT2DSL_FEW_SHOTS_EXAMPLE_NUM from run_config import TEXT2DSL_COLLECTION_NAME, TEXT2DSL_FEW_SHOTS_EXAMPLE_NUM
def reload_sql_example_collection(vectorstore:Chroma,
sql_examplars:List[Mapping[str, str]], def reload_sql_example_collection(
sql_example_selector:SemanticSimilarityExampleSelector, vectorstore: Chroma,
example_nums:int sql_examplars: List[Mapping[str, str]],
): sql_example_selector: SemanticSimilarityExampleSelector,
example_nums: int,
):
print("original sql_examples_collection size:", vectorstore._collection.count()) print("original sql_examples_collection size:", vectorstore._collection.count())
new_collection = empty_chroma_collection_2(collection=vectorstore._collection) new_collection = empty_chroma_collection_2(collection=vectorstore._collection)
vectorstore._collection = new_collection vectorstore._collection = new_collection
print("emptied sql_examples_collection size:", vectorstore._collection.count()) print("emptied sql_examples_collection size:", vectorstore._collection.count())
sql_example_selector = SemanticSimilarityExampleSelector(vectorstore=sql_examples_vectorstore, k=example_nums, sql_example_selector = SemanticSimilarityExampleSelector(
input_keys=["question"], vectorstore=sql_examples_vectorstore,
example_keys=["table_name", "fields_list", "prior_schema_links", "question", "analysis", "schema_links", "current_date", "sql"]) k=example_nums,
input_keys=["question"],
example_keys=[
"table_name",
"fields_list",
"prior_schema_links",
"question",
"analysis",
"schema_links",
"current_date",
"sql",
],
)
for example in sql_examplars: for example in sql_examplars:
sql_example_selector.add_example(example) sql_example_selector.add_example(example)
@@ -45,20 +59,36 @@ def reload_sql_example_collection(vectorstore:Chroma,
return vectorstore, sql_example_selector return vectorstore, sql_example_selector
sql_examples_vectorstore = Chroma(collection_name=TEXT2DSL_COLLECTION_NAME, sql_examples_vectorstore = Chroma(
embedding_function=hg_embedding, collection_name=TEXT2DSL_COLLECTION_NAME,
client=chromadb_client) embedding_function=hg_embedding,
client=chromadb_client,
)
example_nums = TEXT2DSL_FEW_SHOTS_EXAMPLE_NUM example_nums = TEXT2DSL_FEW_SHOTS_EXAMPLE_NUM
sql_example_selector = SemanticSimilarityExampleSelector(vectorstore=sql_examples_vectorstore, k=example_nums, sql_example_selector = SemanticSimilarityExampleSelector(
input_keys=["question"], vectorstore=sql_examples_vectorstore,
example_keys=["table_name", "fields_list", "prior_schema_links", "question", "analysis", "schema_links", "current_date", "sql"]) k=example_nums,
input_keys=["question"],
example_keys=[
"table_name",
"fields_list",
"prior_schema_links",
"question",
"analysis",
"schema_links",
"current_date",
"sql",
],
)
if sql_examples_vectorstore._collection.count() > 0: if sql_examples_vectorstore._collection.count() > 0:
print("examples already in sql_vectorstore") print("examples already in sql_vectorstore")
print("init sql_vectorstore size:", sql_examples_vectorstore._collection.count()) print("init sql_vectorstore size:", sql_examples_vectorstore._collection.count())
print("sql_examplars size:", len(sql_examplars)) print("sql_examplars size:", len(sql_examplars))
sql_examples_vectorstore, sql_example_selector = reload_sql_example_collection(sql_examples_vectorstore, sql_examplars, sql_example_selector, example_nums) sql_examples_vectorstore, sql_example_selector = reload_sql_example_collection(
sql_examples_vectorstore, sql_examplars, sql_example_selector, example_nums
)
print("added sql_vectorstore size:", sql_examples_vectorstore._collection.count()) print("added sql_vectorstore size:", sql_examples_vectorstore._collection.count())

View File

@@ -13,17 +13,31 @@ from few_shot_example.sql_exampler import examplars as sql_examplars
from run_config import LLMPARSER_HOST, LLMPARSER_PORT from run_config import LLMPARSER_HOST, LLMPARSER_PORT
def text2dsl_setting_update(llm_parser_host:str, llm_parser_port:str, def text2dsl_setting_update(
sql_examplars:List[Mapping[str, str]], example_nums:int, is_shortcut:bool): llm_parser_host: str,
llm_parser_port: str,
sql_examplars: List[Mapping[str, str]],
example_nums: int,
is_shortcut: bool,
):
url = f"http://{llm_parser_host}:{llm_parser_port}/query2sql_setting_update/" url = f"http://{llm_parser_host}:{llm_parser_port}/query2sql_setting_update/"
print("url: ", url) print("url: ", url)
payload = {"sqlExamplars":sql_examplars, "exampleNums":example_nums, "isShortcut":is_shortcut} payload = {
headers = {'content-type': 'application/json'} "sqlExamplars": sql_examplars,
"exampleNums": example_nums,
"isShortcut": is_shortcut,
}
headers = {"content-type": "application/json"}
response = requests.post(url, data=json.dumps(payload), headers=headers) response = requests.post(url, data=json.dumps(payload), headers=headers)
print(response.text) print(response.text)
if __name__ == "__main__": if __name__ == "__main__":
text2dsl_setting_update(LLMPARSER_HOST, LLMPARSER_PORT, text2dsl_setting_update(
sql_examplars, TEXT2DSL_FEW_SHOTS_EXAMPLE_NUM, TEXT2DSL_IS_SHORTCUT) LLMPARSER_HOST,
LLMPARSER_PORT,
sql_examplars,
TEXT2DSL_FEW_SHOTS_EXAMPLE_NUM,
TEXT2DSL_IS_SHORTCUT,
)

View File

@@ -1,21 +1,25 @@
# -*- coding:utf-8 -*- # -*- coding:utf-8 -*-
import re import re
def schema_link_parse(schema_link_output): def schema_link_parse(schema_link_output):
try: try:
schema_link_output = schema_link_output.strip() schema_link_output = schema_link_output.strip()
pattern = r'Schema_links:(.*)' pattern = r"Schema_links:(.*)"
schema_link_output = re.findall(pattern, schema_link_output, re.DOTALL)[0].strip() schema_link_output = re.findall(pattern, schema_link_output, re.DOTALL)[
0
].strip()
except Exception as e: except Exception as e:
print(e) print(e)
schema_link_output = None schema_link_output = None
return schema_link_output return schema_link_output
def combo_schema_link_parse(schema_linking_sql_combo_output: str): def combo_schema_link_parse(schema_linking_sql_combo_output: str):
try: try:
schema_linking_sql_combo_output = schema_linking_sql_combo_output.strip() schema_linking_sql_combo_output = schema_linking_sql_combo_output.strip()
pattern = r'Schema_links:(\[.*?\])' pattern = r"Schema_links:(\[.*?\])"
schema_links_match = re.search(pattern, schema_linking_sql_combo_output) schema_links_match = re.search(pattern, schema_linking_sql_combo_output)
if schema_links_match: if schema_links_match:
@@ -28,10 +32,11 @@ def combo_schema_link_parse(schema_linking_sql_combo_output: str):
return schema_links return schema_links
def combo_sql_parse(schema_linking_sql_combo_output: str): def combo_sql_parse(schema_linking_sql_combo_output: str):
try: try:
schema_linking_sql_combo_output = schema_linking_sql_combo_output.strip() schema_linking_sql_combo_output = schema_linking_sql_combo_output.strip()
pattern = r'SQL:(.*)' pattern = r"SQL:(.*)"
sql_match = re.search(pattern, schema_linking_sql_combo_output) sql_match = re.search(pattern, schema_linking_sql_combo_output)
if sql_match: if sql_match:

View File

@@ -11,17 +11,31 @@ from langchain.prompts.few_shot import FewShotPromptTemplate
from langchain.prompts.example_selector import SemanticSimilarityExampleSelector from langchain.prompts.example_selector import SemanticSimilarityExampleSelector
def schema_linking_exampler(user_query: str, def schema_linking_exampler(
domain_name: str, user_query: str,
fields_list: List[str], domain_name: str,
prior_schema_links: Mapping[str,str], fields_list: List[str],
example_selector: SemanticSimilarityExampleSelector, prior_schema_links: Mapping[str, str],
) -> str: example_selector: SemanticSimilarityExampleSelector,
) -> str:
prior_schema_links_str = '['+ ','.join(["""'{}'->{}""".format(k,v) for k,v in prior_schema_links.items()]) + ']' prior_schema_links_str = (
"["
+ ",".join(["""'{}'->{}""".format(k, v) for k, v in prior_schema_links.items()])
+ "]"
)
example_prompt_template = PromptTemplate(input_variables=["table_name", "fields_list", "prior_schema_links", "question", "analysis", "schema_links"], example_prompt_template = PromptTemplate(
template="Table {table_name}, columns = {fields_list}, prior_schema_links = {prior_schema_links}\n问题:{question}\n分析:{analysis} 所以Schema_links是:\nSchema_links:{schema_links}") input_variables=[
"table_name",
"fields_list",
"prior_schema_links",
"question",
"analysis",
"schema_links",
],
template="Table {table_name}, columns = {fields_list}, prior_schema_links = {prior_schema_links}\n问题:{question}\n分析:{analysis} 所以Schema_links是:\nSchema_links:{schema_links}",
)
instruction = "# 根据数据库的表结构,参考先验信息,找出为每个问题生成SQL查询语句的schema_links" instruction = "# 根据数据库的表结构,参考先验信息,找出为每个问题生成SQL查询语句的schema_links"
@@ -30,81 +44,121 @@ def schema_linking_exampler(user_query: str,
schema_linking_example_prompt_template = FewShotPromptTemplate( schema_linking_example_prompt_template = FewShotPromptTemplate(
example_selector=example_selector, example_selector=example_selector,
example_prompt=example_prompt_template, example_prompt=example_prompt_template,
example_separator="\n\n", example_separator="\n\n",
prefix=instruction, prefix=instruction,
input_variables=["table_name", "fields_list", "prior_schema_links", "question"], input_variables=["table_name", "fields_list", "prior_schema_links", "question"],
suffix=schema_linking_prompt suffix=schema_linking_prompt,
) )
schema_linking_example_prompt = schema_linking_example_prompt_template.format(table_name=domain_name, schema_linking_example_prompt = schema_linking_example_prompt_template.format(
fields_list=fields_list, table_name=domain_name,
prior_schema_links=prior_schema_links_str, fields_list=fields_list,
question=user_query) prior_schema_links=prior_schema_links_str,
question=user_query,
)
return schema_linking_example_prompt return schema_linking_example_prompt
def sql_exampler(user_query: str, def sql_exampler(
domain_name: str, user_query: str,
schema_link_str: str, domain_name: str,
data_date: str, schema_link_str: str,
example_selector: SemanticSimilarityExampleSelector, data_date: str,
) -> str: example_selector: SemanticSimilarityExampleSelector,
) -> str:
instruction = "# 根据schema_links为每个问题生成SQL查询语句" instruction = "# 根据schema_links为每个问题生成SQL查询语句"
sql_example_prompt_template = PromptTemplate(input_variables=["question", "current_date", "table_name", "schema_links", "sql"], sql_example_prompt_template = PromptTemplate(
template="问题:{question}\nCurrent_date:{current_date}\nTable {table_name}\nSchema_links:{schema_links}\nSQL:{sql}") input_variables=[
"question",
"current_date",
"table_name",
"schema_links",
"sql",
],
template="问题:{question}\nCurrent_date:{current_date}\nTable {table_name}\nSchema_links:{schema_links}\nSQL:{sql}",
)
sql_prompt = "问题:{question}\nCurrent_date:{current_date}\nTable {table_name}\nSchema_links:{schema_links}\nSQL:" sql_prompt = "问题:{question}\nCurrent_date:{current_date}\nTable {table_name}\nSchema_links:{schema_links}\nSQL:"
sql_example_prompt_template = FewShotPromptTemplate( sql_example_prompt_template = FewShotPromptTemplate(
example_selector=example_selector, example_selector=example_selector,
example_prompt=sql_example_prompt_template, example_prompt=sql_example_prompt_template,
example_separator="\n\n", example_separator="\n\n",
prefix=instruction, prefix=instruction,
input_variables=["question", "current_date", "table_name", "schema_links"], input_variables=["question", "current_date", "table_name", "schema_links"],
suffix=sql_prompt suffix=sql_prompt,
) )
sql_example_prompt = sql_example_prompt_template.format(question=user_query, sql_example_prompt = sql_example_prompt_template.format(
current_date=data_date, question=user_query,
table_name=domain_name, current_date=data_date,
schema_links=schema_link_str) table_name=domain_name,
schema_links=schema_link_str,
)
return sql_example_prompt return sql_example_prompt
def schema_linking_sql_combo_examplar(user_query: str, def schema_linking_sql_combo_examplar(
domain_name: str, user_query: str,
data_date : str, domain_name: str,
fields_list: List[str], data_date: str,
prior_schema_links: Mapping[str,str], fields_list: List[str],
example_selector: SemanticSimilarityExampleSelector) -> str: prior_schema_links: Mapping[str, str],
example_selector: SemanticSimilarityExampleSelector,
prior_schema_links_str = '['+ ','.join(["""'{}'->{}""".format(k,v) for k,v in prior_schema_links.items()]) + ']' ) -> str:
example_prompt_template = PromptTemplate(input_variables=["table_name", "fields_list", "prior_schema_links", "current_date", "question", "analysis", "schema_links", "sql"], prior_schema_links_str = (
template="Table {table_name}, columns = {fields_list}, prior_schema_links = {prior_schema_links}\nCurrent_date:{current_date}\n问题:{question}\n分析:{analysis} 所以Schema_links是:\nSchema_links:{schema_links}\nSQL:{sql}") "["
+ ",".join(["""'{}'->{}""".format(k, v) for k, v in prior_schema_links.items()])
+ "]"
)
instruction = "# 根据数据库的表结构,参考先验信息,找出为每个问题生成SQL查询语句的schema_links,再根据schema_links为每个问题生成SQL查询语句" example_prompt_template = PromptTemplate(
input_variables=[
"table_name",
"fields_list",
"prior_schema_links",
"current_date",
"question",
"analysis",
"schema_links",
"sql",
],
template="Table {table_name}, columns = {fields_list}, prior_schema_links = {prior_schema_links}\nCurrent_date:{current_date}\n问题:{question}\n分析:{analysis} 所以Schema_links是:\nSchema_links:{schema_links}\nSQL:{sql}",
)
instruction = (
"# 根据数据库的表结构,参考先验信息,找出为每个问题生成SQL查询语句的schema_links,再根据schema_links为每个问题生成SQL查询语句"
)
schema_linking_sql_combo_prompt = "Table {table_name}, columns = {fields_list}, prior_schema_links = {prior_schema_links}\nCurrent_date:{current_date}\n问题:{question}\n分析: 让我们一步一步地思考。" schema_linking_sql_combo_prompt = "Table {table_name}, columns = {fields_list}, prior_schema_links = {prior_schema_links}\nCurrent_date:{current_date}\n问题:{question}\n分析: 让我们一步一步地思考。"
schema_linking_sql_combo_example_prompt_template = FewShotPromptTemplate( schema_linking_sql_combo_example_prompt_template = FewShotPromptTemplate(
example_selector=example_selector, example_selector=example_selector,
example_prompt=example_prompt_template, example_prompt=example_prompt_template,
example_separator="\n\n", example_separator="\n\n",
prefix=instruction, prefix=instruction,
input_variables=["table_name", "fields_list", "prior_schema_links", "current_date", "question"], input_variables=[
suffix=schema_linking_sql_combo_prompt "table_name",
"fields_list",
"prior_schema_links",
"current_date",
"question",
],
suffix=schema_linking_sql_combo_prompt,
)
schema_linking_sql_combo_example_prompt = (
schema_linking_sql_combo_example_prompt_template.format(
table_name=domain_name,
fields_list=fields_list,
prior_schema_links=prior_schema_links_str,
current_date=data_date,
question=user_query,
) )
)
schema_linking_sql_combo_example_prompt = schema_linking_sql_combo_example_prompt_template.format(table_name=domain_name,
fields_list=fields_list,
prior_schema_links=prior_schema_links_str,
current_date=data_date,
question=user_query)
return schema_linking_sql_combo_example_prompt return schema_linking_sql_combo_example_prompt

View File

@@ -7,133 +7,182 @@ import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(os.path.dirname(os.path.abspath(__file__))) sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from sql.prompt_maker import schema_linking_exampler, sql_exampler, schema_linking_sql_combo_examplar from sql.prompt_maker import (
from sql.constructor import sql_examples_vectorstore, sql_example_selector, reload_sql_example_collection schema_linking_exampler,
from sql.output_parser import schema_link_parse, combo_schema_link_parse, combo_sql_parse sql_exampler,
schema_linking_sql_combo_examplar,
)
from sql.constructor import (
sql_examples_vectorstore,
sql_example_selector,
reload_sql_example_collection,
)
from sql.output_parser import (
schema_link_parse,
combo_schema_link_parse,
combo_sql_parse,
)
from util.llm_instance import llm from util.llm_instance import llm
from run_config import TEXT2DSL_IS_SHORTCUT from run_config import TEXT2DSL_IS_SHORTCUT
class Text2DSLAgent(object): class Text2DSLAgent(object):
def __init__(self): def __init__(self):
self.schema_linking_exampler = schema_linking_exampler self.schema_linking_exampler = schema_linking_exampler
self.sql_exampler = sql_exampler self.sql_exampler = sql_exampler
self.schema_linking_sql_combo_exampler = schema_linking_sql_combo_examplar self.schema_linking_sql_combo_exampler = schema_linking_sql_combo_examplar
self.sql_examples_vectorstore = sql_examples_vectorstore self.sql_examples_vectorstore = sql_examples_vectorstore
self.sql_example_selector = sql_example_selector self.sql_example_selector = sql_example_selector
self.schema_link_parse = schema_link_parse self.schema_link_parse = schema_link_parse
self.combo_schema_link_parse = combo_schema_link_parse self.combo_schema_link_parse = combo_schema_link_parse
self.combo_sql_parse = combo_sql_parse self.combo_sql_parse = combo_sql_parse
self.llm = llm self.llm = llm
self.is_shortcut = TEXT2DSL_IS_SHORTCUT self.is_shortcut = TEXT2DSL_IS_SHORTCUT
def update_examples(self, sql_examples, example_nums, is_shortcut): def update_examples(self, sql_examples, example_nums, is_shortcut):
self.sql_examples_vectorstore, self.sql_example_selector = reload_sql_example_collection(self.sql_examples_vectorstore, (
sql_examples, self.sql_examples_vectorstore,
self.sql_example_selector, self.sql_example_selector,
example_nums) ) = reload_sql_example_collection(
self.is_shortcut = is_shortcut self.sql_examples_vectorstore,
sql_examples,
self.sql_example_selector,
example_nums,
)
self.is_shortcut = is_shortcut
def query2sql(self, query_text: str, def query2sql(
schema : Union[dict, None] = None, self,
current_date: str = None, query_text: str,
linking: Union[List[Mapping[str, str]], None] = None schema: Union[dict, None] = None,
): current_date: str = None,
linking: Union[List[Mapping[str, str]], None] = None,
):
print("query_text: ", query_text) print("query_text: ", query_text)
print("schema: ", schema) print("schema: ", schema)
print("current_date: ", current_date) print("current_date: ", current_date)
print("prior_schema_links: ", linking) print("prior_schema_links: ", linking)
if linking is not None: if linking is not None:
prior_schema_links = {item['fieldValue']:item['fieldName'] for item in linking} prior_schema_links = {
else: item["fieldValue"]: item["fieldName"] for item in linking
prior_schema_links = {} }
else:
prior_schema_links = {}
model_name = schema['modelName'] model_name = schema["modelName"]
fields_list = schema['fieldNameList'] fields_list = schema["fieldNameList"]
schema_linking_prompt = self.schema_linking_exampler(query_text, model_name, fields_list, prior_schema_links, self.sql_example_selector) schema_linking_prompt = self.schema_linking_exampler(
print("schema_linking_prompt->", schema_linking_prompt) query_text,
schema_link_output = self.llm(schema_linking_prompt) model_name,
schema_link_str = self.schema_link_parse(schema_link_output) fields_list,
prior_schema_links,
sql_prompt = self.sql_exampler(query_text, model_name, schema_link_str, current_date, self.sql_example_selector) self.sql_example_selector,
print("sql_prompt->", sql_prompt) )
sql_output = self.llm(sql_prompt) print("schema_linking_prompt->", schema_linking_prompt)
schema_link_output = self.llm(schema_linking_prompt)
schema_link_str = self.schema_link_parse(schema_link_output)
resp = dict() sql_prompt = self.sql_exampler(
resp['query'] = query_text query_text,
resp['model'] = model_name model_name,
resp['fields'] = fields_list schema_link_str,
resp['priorSchemaLinking'] = linking current_date,
resp['dataDate'] = current_date self.sql_example_selector,
)
print("sql_prompt->", sql_prompt)
sql_output = self.llm(sql_prompt)
resp['analysisOutput'] = schema_link_output resp = dict()
resp['schemaLinkStr'] = schema_link_str resp["query"] = query_text
resp["model"] = model_name
resp['sqlOutput'] = sql_output resp["fields"] = fields_list
resp["priorSchemaLinking"] = linking
resp["dataDate"] = current_date
print("resp: ", resp) resp["analysisOutput"] = schema_link_output
resp["schemaLinkStr"] = schema_link_str
return resp resp["sqlOutput"] = sql_output
def query2sqlcombo(self, query_text: str, print("resp: ", resp)
schema : Union[dict, None] = None,
current_date: str = None,
linking: Union[List[Mapping[str, str]], None] = None
):
print("query_text: ", query_text) return resp
print("schema: ", schema)
print("current_date: ", current_date)
print("prior_schema_links: ", linking)
if linking is not None: def query2sqlcombo(
prior_schema_links = {item['fieldValue']:item['fieldName'] for item in linking} self,
else: query_text: str,
prior_schema_links = {} schema: Union[dict, None] = None,
current_date: str = None,
linking: Union[List[Mapping[str, str]], None] = None,
):
model_name = schema['modelName'] print("query_text: ", query_text)
fields_list = schema['fieldNameList'] print("schema: ", schema)
print("current_date: ", current_date)
print("prior_schema_links: ", linking)
schema_linking_sql_combo_prompt = self.schema_linking_sql_combo_exampler(query_text, model_name, current_date, fields_list, if linking is not None:
prior_schema_links, self.sql_example_selector) prior_schema_links = {
print("schema_linking_sql_combo_prompt->", schema_linking_sql_combo_prompt) item["fieldValue"]: item["fieldName"] for item in linking
schema_linking_sql_combo_output = self.llm(schema_linking_sql_combo_prompt) }
else:
prior_schema_links = {}
schema_linking_str = self.combo_schema_link_parse(schema_linking_sql_combo_output) model_name = schema["modelName"]
sql_str = self.combo_sql_parse(schema_linking_sql_combo_output) fields_list = schema["fieldNameList"]
resp = dict() schema_linking_sql_combo_prompt = self.schema_linking_sql_combo_exampler(
resp['query'] = query_text query_text,
resp['model'] = model_name model_name,
resp['fields'] = fields_list current_date,
resp['priorSchemaLinking'] = prior_schema_links fields_list,
resp['dataDate'] = current_date prior_schema_links,
self.sql_example_selector,
)
print("schema_linking_sql_combo_prompt->", schema_linking_sql_combo_prompt)
schema_linking_sql_combo_output = self.llm(schema_linking_sql_combo_prompt)
resp['analysisOutput'] = schema_linking_sql_combo_output schema_linking_str = self.combo_schema_link_parse(
resp['schemaLinkStr'] = schema_linking_str schema_linking_sql_combo_output
resp['sqlOutput'] = sql_str )
sql_str = self.combo_sql_parse(schema_linking_sql_combo_output)
print("resp: ", resp) resp = dict()
resp["query"] = query_text
resp["model"] = model_name
resp["fields"] = fields_list
resp["priorSchemaLinking"] = prior_schema_links
resp["dataDate"] = current_date
return resp resp["analysisOutput"] = schema_linking_sql_combo_output
resp["schemaLinkStr"] = schema_linking_str
resp["sqlOutput"] = sql_str
def query2sql_run(self, query_text: str, print("resp: ", resp)
schema : Union[dict, None] = None,
current_date: str = None, return resp
linking: Union[List[Mapping[str, str]], None] = None):
def query2sql_run(
self,
query_text: str,
schema: Union[dict, None] = None,
current_date: str = None,
linking: Union[List[Mapping[str, str]], None] = None,
):
if self.is_shortcut:
return self.query2sqlcombo(query_text, schema, current_date, linking)
else:
return self.query2sql(query_text, schema, current_date, linking)
if self.is_shortcut:
return self.query2sqlcombo(query_text, schema, current_date, linking)
else:
return self.query2sql(query_text, schema, current_date, linking)
text2sql_agent = Text2DSLAgent() text2sql_agent = Text2DSLAgent()

View File

@@ -13,11 +13,19 @@ from fastapi import FastAPI, HTTPException
from sql.run import text2sql_agent from sql.run import text2sql_agent
from preset_retrieval.run import preset_query_retrieval_run, collection as preset_query_collection from preset_retrieval.run import (
from preset_retrieval.preset_query_db import (add2preset_query_collection, update_preset_query_collection, preset_query_retrieval_run,
empty_preset_query_collection, delete_preset_query_by_ids, collection as preset_query_collection,
update_preset_query_collection, get_preset_query_by_ids, )
preset_query_collection_size) from preset_retrieval.preset_query_db import (
add2preset_query_collection,
update_preset_query_collection,
empty_preset_query_collection,
delete_preset_query_by_ids,
update_preset_query_collection,
get_preset_query_by_ids,
preset_query_collection_size,
)
from plugin_call.run import plugin_selection_run from plugin_call.run import plugin_selection_run
@@ -27,62 +35,64 @@ from run_config import LLMPARSER_PORT
app = FastAPI() app = FastAPI()
@app.post("/query2sql/") @app.post("/query2sql/")
async def din_query2sql(query_body: Mapping[str, Any]): async def din_query2sql(query_body: Mapping[str, Any]):
if 'queryText' not in query_body: if "queryText" not in query_body:
raise HTTPException(status_code=400, raise HTTPException(status_code=400, detail="query_text is not in query_body")
detail="query_text is not in query_body")
else: else:
query_text = query_body['queryText'] query_text = query_body["queryText"]
if 'schema' not in query_body: if "schema" not in query_body:
raise HTTPException(status_code=400, detail="schema is not in query_body") raise HTTPException(status_code=400, detail="schema is not in query_body")
else: else:
schema = query_body['schema'] schema = query_body["schema"]
if 'currentDate' not in query_body: if "currentDate" not in query_body:
raise HTTPException(status_code=400, detail="currentDate is not in query_body") raise HTTPException(status_code=400, detail="currentDate is not in query_body")
else: else:
current_date = query_body['currentDate'] current_date = query_body["currentDate"]
if 'linking' not in query_body: if "linking" not in query_body:
linking = None linking = None
else: else:
linking = query_body['linking'] linking = query_body["linking"]
resp = text2sql_agent.query2sql_run(query_text=query_text, resp = text2sql_agent.query2sql_run(
schema=schema, current_date=current_date, linking=linking) query_text=query_text, schema=schema, current_date=current_date, linking=linking
)
return resp return resp
@app.post("/query2sql_setting_update/") @app.post("/query2sql_setting_update/")
async def query2sql_setting_update(query_body: Mapping[str, Any]): async def query2sql_setting_update(query_body: Mapping[str, Any]):
if 'sqlExamplars' not in query_body: if "sqlExamplars" not in query_body:
raise HTTPException(status_code=400, raise HTTPException(status_code=400, detail="sqlExamplars is not in query_body")
detail="sqlExamplars is not in query_body")
else: else:
sql_examplars = query_body['sqlExamplars'] sql_examplars = query_body["sqlExamplars"]
if 'exampleNums' not in query_body: if "exampleNums" not in query_body:
raise HTTPException(status_code=400, detail="exampleNums is not in query_body") raise HTTPException(status_code=400, detail="exampleNums is not in query_body")
else: else:
example_nums = query_body['exampleNums'] example_nums = query_body["exampleNums"]
if 'isShortcut' not in query_body: if "isShortcut" not in query_body:
raise HTTPException(status_code=400, detail="isShortcut is not in query_body") raise HTTPException(status_code=400, detail="isShortcut is not in query_body")
else: else:
is_shortcut = query_body['isShortcut'] is_shortcut = query_body["isShortcut"]
text2sql_agent.update_examples(sql_examples=sql_examplars, example_nums=example_nums, is_shortcut=is_shortcut) text2sql_agent.update_examples(
sql_examples=sql_examplars, example_nums=example_nums, is_shortcut=is_shortcut
)
return "success" return "success"
@app.post("/preset_query_retrival/") @app.post("/preset_query_retrival/")
async def preset_query_retrival(query_text_list: List[str], n_results: int = 5): async def preset_query_retrival(query_text_list: List[str], n_results: int = 5):
parsed_retrieval_res_format = preset_query_retrieval_run(preset_query_collection, query_text_list, n_results) parsed_retrieval_res_format = preset_query_retrieval_run(
preset_query_collection, query_text_list, n_results
)
return parsed_retrieval_res_format return parsed_retrieval_res_format
@@ -93,27 +103,32 @@ async def preset_query_add(preset_info_list: List[Mapping[str, str]]):
preset_query_ids = [] preset_query_ids = []
for preset_info in preset_info_list: for preset_info in preset_info_list:
preset_queries.append(preset_info['preset_query']) preset_queries.append(preset_info["preset_query"])
preset_query_ids.append(preset_info['preset_query_id']) preset_query_ids.append(preset_info["preset_query_id"])
add2preset_query_collection(collection=preset_query_collection, add2preset_query_collection(
preset_queries=preset_queries, collection=preset_query_collection,
preset_query_ids=preset_query_ids) preset_queries=preset_queries,
preset_query_ids=preset_query_ids,
)
return "success" return "success"
@app.post("/preset_query_update/") @app.post("/preset_query_update/")
async def preset_query_update(preset_info_list: List[Mapping[str, str]]): async def preset_query_update(preset_info_list: List[Mapping[str, str]]):
preset_queries = [] preset_queries = []
preset_query_ids = [] preset_query_ids = []
for preset_info in preset_info_list: for preset_info in preset_info_list:
preset_queries.append(preset_info['preset_query']) preset_queries.append(preset_info["preset_query"])
preset_query_ids.append(preset_info['preset_query_id']) preset_query_ids.append(preset_info["preset_query_id"])
update_preset_query_collection(collection=preset_query_collection, update_preset_query_collection(
preset_queries=preset_queries, collection=preset_query_collection,
preset_query_ids=preset_query_ids) preset_queries=preset_queries,
preset_query_ids=preset_query_ids,
)
return "success" return "success"
@@ -124,39 +139,50 @@ async def preset_query_empty():
return "success" return "success"
@app.post("/preset_delete_by_ids/") @app.post("/preset_delete_by_ids/")
async def preset_delete_by_ids(preset_query_ids: List[str]): async def preset_delete_by_ids(preset_query_ids: List[str]):
delete_preset_query_by_ids(collection=preset_query_collection, preset_query_ids=preset_query_ids) delete_preset_query_by_ids(
collection=preset_query_collection, preset_query_ids=preset_query_ids
)
return "success" return "success"
@app.post("/preset_get_by_ids/") @app.post("/preset_get_by_ids/")
async def preset_get_by_ids(preset_query_ids: List[str]): async def preset_get_by_ids(preset_query_ids: List[str]):
preset_queries = get_preset_query_by_ids(collection=preset_query_collection, preset_query_ids=preset_query_ids) preset_queries = get_preset_query_by_ids(
collection=preset_query_collection, preset_query_ids=preset_query_ids
)
return preset_queries return preset_queries
@app.get("/preset_query_size/") @app.get("/preset_query_size/")
async def preset_query_size(): async def preset_query_size():
size = preset_query_collection_size(collection=preset_query_collection) size = preset_query_collection_size(collection=preset_query_collection)
return size return size
@app.post("/plugin_selection/") @app.post("/plugin_selection/")
async def tool_selection(query_body: Mapping[str, Any]): async def tool_selection(query_body: Mapping[str, Any]):
if 'queryText' not in query_body: if "queryText" not in query_body:
raise HTTPException(status_code=400, detail="query_text is not in query_body") raise HTTPException(status_code=400, detail="query_text is not in query_body")
else: else:
query_text = query_body['queryText'] query_text = query_body["queryText"]
if 'pluginConfigs' not in query_body: if "pluginConfigs" not in query_body:
raise HTTPException(status_code=400, detail="pluginConfigs is not in query_body") raise HTTPException(
status_code=400, detail="pluginConfigs is not in query_body"
)
else: else:
plugin_configs = query_body['pluginConfigs'] plugin_configs = query_body["pluginConfigs"]
resp = plugin_selection_run(query_text=query_text, plugin_configs=plugin_configs) resp = plugin_selection_run(query_text=query_text, plugin_configs=plugin_configs)
return resp return resp
if __name__ == "__main__": if __name__ == "__main__":
uvicorn.run(app, host=LLMPARSER_HOST, port=LLMPARSER_PORT) uvicorn.run(app, host=LLMPARSER_HOST, port=LLMPARSER_PORT)

View File

@@ -7,13 +7,15 @@ from chromadb.config import Settings
from run_config import CHROMA_DB_PERSIST_PATH from run_config import CHROMA_DB_PERSIST_PATH
client = chromadb.Client(Settings( client = chromadb.Client(
chroma_db_impl="duckdb+parquet", Settings(
persist_directory=CHROMA_DB_PERSIST_PATH # Optional, defaults to .chromadb/ in the current directory chroma_db_impl="duckdb+parquet",
)) persist_directory=CHROMA_DB_PERSIST_PATH, # Optional, defaults to .chromadb/ in the current directory
)
)
def empty_chroma_collection_2(collection:Collection): def empty_chroma_collection_2(collection: Collection):
collection_name = collection.name collection_name = collection.name
client = collection._client client = collection._client
metadata = collection.metadata metadata = collection.metadata
@@ -21,17 +23,18 @@ def empty_chroma_collection_2(collection:Collection):
client.delete_collection(collection_name) client.delete_collection(collection_name)
new_collection = client.get_or_create_collection(name=collection_name, new_collection = client.get_or_create_collection(
metadata=metadata, name=collection_name, metadata=metadata, embedding_function=embedding_function
embedding_function=embedding_function) )
size_of_new_collection = new_collection.count() size_of_new_collection = new_collection.count()
print(f'Collection {collection_name} emptied. Size of new collection: {size_of_new_collection}') print(
f"Collection {collection_name} emptied. Size of new collection: {size_of_new_collection}"
)
return new_collection return new_collection
def empty_chroma_collection(collection:Collection): def empty_chroma_collection(collection: Collection):
collection.delete() collection.delete()

View File

@@ -4,5 +4,6 @@ from langchain.llms import OpenAI
from run_config import MODEL_NAME, OPENAI_API_KEY, TEMPERATURE from run_config import MODEL_NAME, OPENAI_API_KEY, TEMPERATURE
llm = OpenAI(openai_api_key=OPENAI_API_KEY, model_name=MODEL_NAME, llm = OpenAI(
temperature=TEMPERATURE) openai_api_key=OPENAI_API_KEY, model_name=MODEL_NAME, temperature=TEMPERATURE
)

View File

@@ -9,6 +9,7 @@ from run_config import HF_TEXT2VEC_MODEL_NAME
hg_embedding = HuggingFaceEmbeddings(model_name=HF_TEXT2VEC_MODEL_NAME) hg_embedding = HuggingFaceEmbeddings(model_name=HF_TEXT2VEC_MODEL_NAME)
class Text2VecEmbeddingFunction(EmbeddingFunction): class Text2VecEmbeddingFunction(EmbeddingFunction):
def __call__(self, texts: Documents) -> Embeddings: def __call__(self, texts: Documents) -> Embeddings:
@@ -16,13 +17,8 @@ class Text2VecEmbeddingFunction(EmbeddingFunction):
return embeddings return embeddings
def get_embeddings(documents:List[str]) -> List[List[float]]:
def get_embeddings(documents: List[str]) -> List[List[float]]:
embeddings = hg_embedding.embed_documents(documents) embeddings = hg_embedding.embed_documents(documents)
return embeddings return embeddings

32
dev/reformat Executable file
View File

@@ -0,0 +1,32 @@
#!/bin/bash
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
set -x

# Extra Maven arguments that activate optional formatter profiles.
# Kept as an array so it expands to nothing at all when no profile is selected.
PROFILES=()

# Python style checks rely on `black` being available on PATH.
if ! command -v black > /dev/null 2>&1
then
  echo "Skip Python lint since 'black' is not available. Please install 'black' by running 'pip install black==22.3.0'"
else
  PROFILES+=("-P" "spotless-python")
fi

# Pass -P only when a profile was actually selected: a bare `-P` with no
# profile id makes mvn abort with a "missing argument" error, which would
# turn the "skip Python lint" path into a hard failure.
mvn spotless:apply "${PROFILES[@]}"

43
pom.xml
View File

@@ -65,6 +65,11 @@
<mockito-inline.version>4.5.1</mockito-inline.version> <mockito-inline.version>4.5.1</mockito-inline.version>
<jsqlparser.version>4.5</jsqlparser.version> <jsqlparser.version>4.5</jsqlparser.version>
<revision>0.7.5-SNAPSHOT</revision> <revision>0.7.5-SNAPSHOT</revision>
<!-- Do not bump the spotless plugin version: 2.30.0 is the latest version that supports Java 8 -->
<maven.plugin.spotless.version>2.30.0</maven.plugin.spotless.version>
<spotless.python.includes></spotless.python.includes>
<!-- Do not bump the black version, as it is decided by the spotless maven plugin -->
<spotless.python.black.version>22.3.0</spotless.python.black.version>
</properties> </properties>
<dependencyManagement> <dependencyManagement>
@@ -101,6 +106,15 @@
</dependencies> </dependencies>
</dependencyManagement> </dependencyManagement>
<profiles>
<profile>
<id>spotless-python</id>
<properties>
<spotless.python.includes>src/**/*.py</spotless.python.includes>
</properties>
</profile>
</profiles>
<build> <build>
<plugins> <plugins>
<plugin> <plugin>
@@ -147,6 +161,10 @@
<groupId>org.apache.maven.plugins</groupId> <groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-checkstyle-plugin</artifactId> <artifactId>maven-checkstyle-plugin</artifactId>
</plugin> </plugin>
<plugin>
<groupId>com.diffplug.spotless</groupId>
<artifactId>spotless-maven-plugin</artifactId>
</plugin>
</plugins> </plugins>
<pluginManagement> <pluginManagement>
<plugins> <plugins>
@@ -185,6 +203,31 @@
</execution> </execution>
</executions> </executions>
</plugin> </plugin>
<plugin>
<groupId>com.diffplug.spotless</groupId>
<artifactId>spotless-maven-plugin</artifactId>
<version>${maven.plugin.spotless.version}</version>
<configuration>
<upToDateChecking>
<enabled>true</enabled>
</upToDateChecking>
<python>
<includes>
<include>${spotless.python.includes}</include>
</includes>
<black>
<version>${spotless.python.black.version}</version>
</black>
</python>
</configuration>
<executions>
<execution>
<goals>
<goal>check</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins> </plugins>
</pluginManagement> </pluginManagement>
</build> </build>