Upgrade text2dsl module for date-related parsing. (#38)

* [improvement](llm) data de-identification for few-shot examples.

* [improvement](llm) add plugin-call and preset retrieval features.

* [fix](llm) remove config variables.

* [improvement][feature] upgrade text2dsl module for date-related parsing.

---------
This commit is contained in:
codescracker
2023-08-28 18:56:24 +08:00
committed by GitHub
parent acca5e4538
commit 0d65d03bee
10 changed files with 486 additions and 113 deletions

3
.gitignore vendored
View File

@@ -14,4 +14,5 @@ assembly/runtime/*
*.umi/
/assembly/deploy
/runtime
**/.flattened-pom.xml
**/.flattened-pom.xml
__pycache__/

View File

@@ -25,20 +25,31 @@ app = FastAPI()
@app.post("/query2sql/")
async def din_query2sql(query_body: Mapping[str, Any]):
    """Translate a natural-language query into SQL by delegating to ``query2sql``.

    Keys read from ``query_body``:
        queryText:   required, the user's question.
        schema:      required, forwarded unchanged to ``query2sql``.
        currentDate: required, forwarded as ``current_date``.
        linking:     optional, forwarded as ``linking`` (``None`` when absent).

    Returns:
        Whatever ``query2sql`` returns.

    Raises:
        HTTPException: status 400 when any required key is missing.
    """
    if 'queryText' not in query_body:
        raise HTTPException(status_code=400,
                            detail="query_text is not in query_body")
    query_text = query_body['queryText']

    if 'schema' not in query_body:
        raise HTTPException(status_code=400, detail="schema is not in query_body")
    schema = query_body['schema']

    if 'currentDate' not in query_body:
        raise HTTPException(status_code=400, detail="currentDate is not in query_body")
    current_date = query_body['currentDate']

    # 'linking' is the only optional key: a missing key is passed on as None.
    linking = query_body.get('linking')

    # Single call site, made only after every input has been collected, so
    # current_date and linking always reach query2sql.
    resp = query2sql(query_text=query_text,
                     schema=schema, current_date=current_date, linking=linking)
    return resp
@app.post("/preset_query_retrival/")

View File

@@ -0,0 +1,296 @@
examplars= [
{ "current_date":"2020-12-01",
"table_name":"内容库产品",
"fields_list":"""["部门", "模块", "用户名", "访问次数", "访问人数", "访问时长", "数据日期"]""",
"question":"比较jerryjzhang和lexluo在内容库的访问次数",
"prior_schema_links":"""['jerryjzhang'->用户名, 'lexluo'->用户名]""",
"analysis": """让我们一步一步地思考。在问题“比较jerryjzhang和lexluo在内容库的访问次数“中我们被问
“比较jerryjzhang和lexluo”所以我们需要column=[用户名]
”内容库的访问次数“所以我们需要column=[访问次数]
基于table和columns可能的cell values 是 = ['jerryjzhang', 'lexluo']。""",
"schema_links":"""["用户名", "访问次数", "'jerryjzhang'", "'lexluo'"]""",
"sql":"""select 用户名, 访问次数 from 内容库产品 where 用户名 in ('jerryjzhang', 'lexluo') and 数据日期 = '2020-12-01' """
},
{ "current_date":"2022-11-06",
"table_name":"内容库产品",
"fields_list":"""["部门", "模块", "用户名", "访问次数", "访问人数", "访问时长", "数据日期"]""",
"question":"内容库近12个月访问人数 按部门",
"prior_schema_links":"""[]""",
"analysis": """让我们一步一步地思考。在问题“内容库近12个月访问人数 按部门“中,我们被问:
”内容库近12个月“所以我们需要column=[数据日期]
“访问人数”所以我们需要column=[访问人数]
”按部门“所以我们需要column=[部门]
基于table和columns可能的cell values 是 = [12]。""",
"schema_links":"""["访问人数", "部门", "数据日期", 12]""",
"sql":"""select 部门, 数据日期, 访问人数 from 内容库产品 where datediff('month', 数据日期, '2022-11-06') <= 12 """
},
{ "current_date":"2023-04-21",
"table_name":"内容库产品",
"fields_list":"""["部门", "模块", "用户名", "访问次数", "访问人数", "访问时长", "数据日期"]""",
"question":"内容库内容合作部、生态业务部的访问时长",
"prior_schema_links":"""['内容合作部'->部门, '生态业务部'->部门]""",
"analysis": """让我们一步一步地思考。在问题“内容库内容合作部、生态业务部的访问时长“中,我们被问:
“访问时长”所以我们需要column=[访问时长]
”内容库内容合作部、生态业务部“所以我们需要column=[部门]
基于table和columns可能的cell values 是 = ['内容合作部', '生态业务部']。""",
"schema_links":"""["访问时长", "部门", "'内容合作部'", "'生态业务部'"]""",
"sql":"""select 部门, 访问时长 from 内容库产品 where 部门 in ('内容合作部', '生态业务部') and 数据日期 = '2023-04-21' """
},
{ "current_date":"2023-08-21",
"table_name":"优选",
"fields_list":"""["优选版权归属系", "付费模式", "结算播放份额", "付费用户结算播放份额", "数据日期"]""",
"question":"近3天阔景系TME结算播放份额",
"prior_schema_links":"""['阔景系'->优选版权归属系]""",
"analysis": """让我们一步一步地思考。在问题“近3天阔景系TME结算播放份额“中我们被问
“TME结算播放份额”所以我们需要column=[结算播放份额]
”阔景系“所以我们需要column=[优选版权归属系]
”近3天“所以我们需要column=[数据日期]
基于table和columns可能的cell values 是 = ['阔景系', 3]。""",
"schema_links":"""["结算播放份额", "优选版权归属系", "数据日期", "'阔景系'", 3]""",
"sql":"""select 优选版权归属系, 结算播放份额 from 优选 where 优选版权归属系 = '阔景系' and datediff('day', 数据日期, '2023-08-21') <= 3 """
},
# Few-shot example: relative day window ("近7天") combined with two value filters.
{ "current_date":"2023-05-22",
"table_name":"歌曲库",
"fields_list":"""["是否音乐人歌曲", "Q音歌曲ID", "Q音歌曲MID", "歌曲名", "歌曲版本", "语种", "歌曲类型", "翻唱类型", "TME歌曲ID", "是否优选窄口径歌曲", "是否优选宽口径歌曲", "结算播放量", "运营播放量", "付费用户结算播放量", "历史累计结算播放量", "运营搜播量", "结算搜播量", "运营完播量", "运营推播量", "近7日复播率", "日均搜播量", "数据日期"]""",
"question":"对比近7天翻唱版和纯音乐的歌曲播放量",
"prior_schema_links":"""['纯音乐'->语种, '翻唱版'->歌曲版本]""",
# The question quoted inside the analysis must match "question" (7 days, not 3),
# otherwise the example contradicts its own cell values and SQL.
"analysis": """让我们一步一步地思考。在问题“对比近7天翻唱版和纯音乐的歌曲播放量“中我们被问
“歌曲播放量”所以我们需要column=[结算播放量]
”翻唱版“所以我们需要column=[歌曲版本]
”和纯音乐的歌曲“所以我们需要column=[语种]
”近7天“所以我们需要column=[数据日期]
基于table和columns可能的cell values 是 = ['翻唱版', '纯音乐', 7]。""",
"schema_links":"""["结算播放量", "歌曲版本", "语种", "数据日期", "'翻唱版'", "'纯音乐'", 7]""",
"sql":"""select 歌曲版本, 语种, 结算播放量 from 歌曲库 where 歌曲版本 = '翻唱版' and 语种 = '纯音乐' and datediff('day', 数据日期, '2023-05-22') <= 7 """
},
{ "current_date":"2023-05-31",
"table_name":"艺人库",
"fields_list":"""["上下架状态", "歌手名", "歌手等级", "歌手类型", "歌手来源", "TME音乐人等级", "活跃区域", "年龄", "歌手才能", "歌手风格", "粉丝数", "抖音粉丝数", "网易粉丝数", "微博粉丝数", "网易歌曲数", "在架歌曲数", "网易分享数", "独占歌曲数", "网易在架歌曲评论数", "有播放量歌曲数", "数据日期"]""",
"question":"对比一下陈卓璇、孟美岐、赖美云的粉丝数",
"prior_schema_links":"""['1527896'->TME歌手ID, '1565463'->TME歌手ID, '2141459'->TME歌手ID]""",
"analysis": """让我们一步一步地思考。在问题“对比一下陈卓璇、孟美岐、赖美云的粉丝数“中,我们被问:
“粉丝数”所以我们需要column=[粉丝数]
”陈卓璇、孟美岐、赖美云“所以我们需要column=[歌手名]
基于table和columns可能的cell values 是 = ['陈卓璇', '孟美岐', '赖美云']。""",
"schema_links":"""["粉丝数", "歌手名", "'陈卓璇'", "'孟美岐'", "'赖美云'"]""",
"sql":"""select 歌手名, 粉丝数 from 艺人库 where 歌手名 in ('陈卓璇', '孟美岐', '赖美云') and 数据日期 = '2023-05-31' """
},
{ "current_date":"2023-07-31",
"table_name":"歌曲库",
"fields_list":"""["歌曲名", "歌曲版本", "歌曲类型", "TME歌曲ID", "是否优选窄口径歌曲", "是否优选宽口径歌曲", "是否音乐人歌曲", "网易歌曲ID", "Q音歌曲ID", "Q音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""",
"question":"播放量大于1万的歌曲有多少",
"prior_schema_links":"""[]""",
"analysis": """让我们一步一步地思考。在问题“播放量大于1万的歌曲有多少“中我们被问
“歌曲有多少”所以我们需要column=[歌曲名]
”播放量大于1万的“所以我们需要column=[结算播放量]
基于table和columns可能的cell values 是 = [10000]。""",
"schema_links":"""["歌曲名", "结算播放量", 10000]""",
"sql":"""select 歌曲名 from 歌曲库 where 结算播放量 > 10000 and 数据日期 = '2023-07-31' """
},
{ "current_date":"2023-07-31",
"table_name":"内容库产品",
"fields_list":"""["用户名", "部门", "模块", "访问时长", "访问次数", "访问人数", "数据日期"]""",
"question":"内容库访问时长小于1小时且来自内容合作部的用户是哪些",
"prior_schema_links":"""['内容合作部'->部门]""",
"analysis": """让我们一步一步地思考。在问题“内容库访问时长小于1小时且来自内容合作部的用户是哪些“中我们被问
“用户是哪些”所以我们需要column=[用户名]
”内容合作部的“所以我们需要column=[部门]
”访问时长小于1小时“所以我们需要column=[访问时长]
基于table和columns可能的cell values 是 = ['内容合作部', 1]。""",
"schema_links":"""["用户名", "部门", "访问时长", "'内容合作部'", 1]""",
"sql":"""select 用户名 from 内容库产品 where 部门 = '内容合作部' and 访问时长 < 1 and 数据日期 = '2023-07-31' """
},
{ "current_date":"2023-08-31",
"table_name":"内容库产品",
"fields_list":"""["用户名", "部门", "模块", "访问时长", "访问次数", "访问人数", "数据日期"]""",
"question":"内容库pv最高的用户有哪些",
"prior_schema_links":"""[]""",
"analysis": """让我们一步一步地思考。在问题“内容库pv最高的用户有哪些“中我们被问
“用户有哪些”所以我们需要column=[用户名]
”pv最高的“所以我们需要column=[访问次数]
基于table和columns可能的cell values 是 = []。""",
"schema_links":"""["用户名", "访问次数"]""",
"sql":"""select 用户名 from 内容库产品 where 数据日期 = '2023-08-31' order by 访问次数 desc limit 10 """
},
{ "current_date":"2023-08-31",
"table_name":"艺人库",
"fields_list":"""["播放量层级", "播放量单调性", "播放量方差", "播放量突增类型", "播放量集中度", "歌手名", "歌手等级", "歌手类型", "歌手来源", "TME音乐人等级", "结算播放量", "运营播放量", "历史累计结算播放量", "有播放量歌曲数", "历史累计运营播放量", "付费用户结算播放量", "结算播放量占比", "运营播放份额", "免费用户结算播放占比", "完播量", "数据日期"]""",
"question":"近90天袁娅维播放量平均值是多少",
"prior_schema_links":"""['152789226'->TME歌手ID]""",
"analysis": """让我们一步一步地思考。在问题“近90天袁娅维播放量平均值是多少“中我们被问
“播放量平均值是多少”所以我们需要column=[结算播放量]
”袁娅维“所以我们需要column=[歌手名]
”近90天“所以我们需要column=[数据日期]
基于table和columns可能的cell values 是 = ['袁娅维', 90]。""",
"schema_links":"""["结算播放量", "歌手名", "数据日期", "'袁娅维'", 90]""",
"sql":"""select avg(结算播放量) from 艺人库 where 歌手名 = '袁娅维' and datediff('day', 数据日期, '2023-08-31') <= 90 """
},
{ "current_date":"2023-08-31",
"table_name":"艺人库",
"fields_list":"""["播放量层级", "播放量单调性", "播放量方差", "播放量突增类型", "播放量集中度", "歌手名", "歌手等级", "歌手类型", "歌手来源", "TME音乐人等级", "结算播放量", "运营播放量", "历史累计结算播放量", "有播放量歌曲数", "历史累计运营播放量", "付费用户结算播放量", "结算播放量占比", "运营播放份额", "免费用户结算播放占比", "完播量", "数据日期"]""",
"question":"周深近7天结算播放量总和是多少",
"prior_schema_links":"""['199509'->TME歌手ID]""",
"analysis": """让我们一步一步地思考。在问题“周深近7天结算播放量总和是多少“中我们被问
“结算播放量总和是多少”所以我们需要column=[结算播放量]
”周深“所以我们需要column=[歌手名]
”近7天“所以我们需要column=[数据日期]
基于table和columns可能的cell values 是 = ['周深', 7]。""",
"schema_links":"""["结算播放量", "歌手名", "数据日期", "'周深'", 7]""",
"sql":"""select sum(结算播放量) from 艺人库 where 歌手名 = '周深' and datediff('day', 数据日期, '2023-08-31') <= 7 """
},
{ "current_date":"2023-09-14",
"table_name":"内容库产品",
"fields_list":"""["部门", "模块", "用户名", "访问次数", "访问人数", "访问时长", "数据日期"]""",
"question":"内容库访问次数大于1k的部门是哪些",
"prior_schema_links":"""[]""",
"analysis": """让我们一步一步地思考。在问题“内容库访问次数大于1k的部门是哪些“中我们被问
“部门是哪些”所以我们需要column=[部门]
”访问次数大于1k的“所以我们需要column=[访问次数]
基于table和columns可能的cell values 是 = [1000]。""",
"schema_links":"""["部门", "访问次数", 1000]""",
"sql":"""select 部门 from 内容库产品 where 访问次数 > 1000 and 数据日期 = '2023-09-14' """
},
{ "current_date":"2023-09-18",
"table_name":"歌曲库",
"fields_list":"""["歌曲名", "TME歌手ID", "歌曲版本", "歌曲类型", "TME歌曲ID", "是否优选窄口径歌曲", "是否优选宽口径歌曲", "是否音乐人歌曲", "网易歌曲ID", "Q音歌曲ID", "Q音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""",
"question":"陈奕迅唱的所有的播放量大于20k的孤勇者有哪些",
"prior_schema_links":"""['199509'->TME歌手ID, '1527123'->TME歌曲ID]""",
"analysis": """让我们一步一步地思考。在问题“陈奕迅唱的所有的播放量大于20k的孤勇者有哪些“中我们被问
“孤勇者有哪些”所以我们需要column=[歌曲名]
”播放量大于20k的“所以我们需要column=[结算播放量]
”陈奕迅唱的“所以我们需要column=[歌手名]
基于table和columns可能的cell values 是 = [20000, '陈奕迅', '孤勇者']。""",
"schema_links":"""["歌曲名", "结算播放量", "歌手名", 20000, "'陈奕迅'", "'孤勇者'"]""",
"sql":"""select 歌曲名 from 歌曲库 where 结算播放量 > 20000 and 歌手名 = '陈奕迅' and 歌曲名 = '孤勇者' and 数据日期 = '2023-09-18' """
},
{ "current_date":"2023-09-18",
"table_name":"歌曲库",
"fields_list":"""["歌曲名", "歌曲版本", "歌手名", "歌曲类型", "发布时间", "TME歌曲ID", "是否优选窄口径歌曲", "是否优选宽口径歌曲", "是否音乐人歌曲", "网易歌曲ID", "Q音歌曲ID", "Q音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""",
"question":"周杰伦去年发布的歌曲有哪些",
"prior_schema_links":"""['23109'->TME歌手ID]""",
"analysis": """让我们一步一步地思考。在问题“周杰伦去年发布的歌曲有哪些“中,我们被问:
“歌曲有哪些”所以我们需要column=[歌曲名]
”去年发布的“所以我们需要column=[发布时间]
”周杰伦“所以我们需要column=[歌手名]
基于table和columns可能的cell values 是 = ['周杰伦', 1]。""",
"schema_links":"""["歌曲名", "发布时间", "歌手名", 1, "'周杰伦'"]""",
"sql":"""select 歌曲名 from 歌曲库 where datediff('year', 发布时间, '2023-09-18') <= 1 and 歌手名 = '周杰伦' and 数据日期 = '2023-09-18' """
},
{ "current_date":"2023-09-11",
"table_name":"艺人库",
"fields_list":"""["播放量层级", "播放量单调性", "播放量方差", "播放量突增类型", "播放量集中度", "歌手名", "歌手等级", "歌手类型", "歌手来源", "签约日期", "TME音乐人等级", "结算播放量", "运营播放量", "历史累计结算播放量", "有播放量歌曲数", "历史累计运营播放量", "付费用户结算播放量", "结算播放量占比", "运营播放份额", "免费用户结算播放占比", "完播量", "数据日期"]""",
"question":"我想要近半年签约的播放量前十的歌手有哪些",
"prior_schema_links":"""[]""",
"analysis": """让我们一步一步地思考。在问题“我想要近半年签约的播放量前十的歌手“中,我们被问:
“歌手有哪些”所以我们需要column=[歌手名]
”播放量前十的“所以我们需要column=[结算播放量]
”近半年签约的“所以我们需要column=[签约日期]
基于table和columns可能的cell values 是 = [0.5, 10]。""",
"schema_links":"""["歌手名", "结算播放量", "签约日期", 0.5, 10]""",
"sql":"""select 歌手名 from 艺人库 where datediff('year', 签约日期, '2023-09-11') <= 0.5 and 数据日期 = '2023-09-11' order by 结算播放量 desc limit 10"""
},
{ "current_date":"2023-08-12",
"table_name":"歌曲库",
"fields_list": """["发行日期", "歌曲语言", "歌曲来源", "歌曲流派", "歌曲名", "歌曲版本", "歌曲类型", "发行时间", "数据日期"]""",
"question":"最近一年发行的歌曲中有哪些在近7天播放超过一千万的",
"prior_schema_links":"""[]""",
"analysis": """让我们一步一步地思考。在问题“最近一年发行的歌曲中有哪些在近7天播放超过一千万的“中我们被问
“发行的歌曲中有哪些”所以我们需要column=[歌曲名]
”最近一年发行的“所以我们需要column=[发行日期]
”在近7天播放超过一千万的“所以我们需要column=[数据日期, 结算播放量]
基于table和columns可能的cell values 是 = [1, 10000000]""",
"schema_links":"""["歌曲名", "发行日期", "数据日期", "结算播放量", 1, 10000000]""",
"sql":"""select 歌曲名 from 歌曲库 where datediff('year', 发行日期, '2023-08-12') <= 1 and datediff('day', 数据日期, '2023-08-12') <= 7 and 结算播放量 > 10000000"""
},
{ "current_date":"2023-08-12",
"table_name":"歌曲库",
"fields_list": """["发行日期", "歌曲语言", "歌曲来源", "歌曲流派", "歌曲名", "歌曲版本", "歌曲类型", "发行时间", "数据日期"]""",
"question":"今年以来发行的歌曲中有哪些在近7天播放超过一千万的",
"prior_schema_links":"""[]""",
"analysis": """让我们一步一步地思考。在问题“今年以来发行的歌曲中有哪些在近7天播放超过一千万的“中我们被问
“发行的歌曲中有哪些”所以我们需要column=[歌曲名]
”今年以来发行的“所以我们需要column=[发行日期]
”在近7天播放超过一千万的“所以我们需要column=[数据日期, 结算播放量]
基于table和columns可能的cell values 是 = [0, 7, 10000000]""",
"schema_links":"""["歌曲名", "发行日期", "数据日期", "结算播放量", 0, 7, 10000000]""",
"sql":"""select 歌曲名 from 歌曲库 where datediff('year', 发行日期, '2023-08-12') <= 0 and datediff('day', 数据日期, '2023-08-12') <= 7 and 结算播放量 > 10000000"""
},
{ "current_date":"2023-08-12",
"table_name":"歌曲库",
"fields_list": """["发行日期", "歌曲语言", "歌曲来源", "歌曲流派", "歌曲名", "歌曲版本", "歌曲类型", "发行时间", "数据日期"]""",
"question":"2023年以来发行的歌曲中有哪些在近7天播放超过一千万的",
"prior_schema_links":"""['514129144'->TME歌曲ID]""",
"analysis": """让我们一步一步地思考。在问题“2023年以来发行的歌曲中有哪些在近7天播放超过一千万的“中我们被问
“发行的歌曲中有哪些”所以我们需要column=[歌曲名]
”2023年以来发行的“所以我们需要column=[发行日期]
”在近7天播放超过一千万的“所以我们需要column=[数据日期, 结算播放量]
基于table和columns可能的cell values 是 = [2023, 7, 10000000]""",
"schema_links":"""["歌曲名", "发行日期", "数据日期", "结算播放量", 2023, 7, 10000000]""",
"sql":"""select 歌曲名 from 歌曲库 where YEAR(发行日期) >= 2023 and datediff('day', 数据日期, '2023-08-12') <= 7 and 结算播放量 > 10000000"""
},
{ "current_date":"2023-08-01",
"table_name":"歌曲库",
"fields_list":"""["歌曲名", "歌曲版本", "歌手名", "歌曲类型", "发布时间", "TME歌曲ID", "是否优选窄口径歌曲", "是否优选宽口径歌曲", "是否音乐人歌曲", "网易歌曲ID", "Q音歌曲ID", "Q音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""",
"question":"周杰伦2023年6月之后发布的歌曲有哪些",
"prior_schema_links":"""['23109'->TME歌手ID]""",
"analysis": """让我们一步一步地思考。在问题“周杰伦2023年6月之后发布的歌曲有哪些“中我们被问
“歌曲有哪些”所以我们需要column=[歌曲名]
”2023年6月之后发布的“所以我们需要column=[发布时间]
”周杰伦“所以我们需要column=[歌手名]
基于table和columns可能的cell values 是 = ['周杰伦', 2023, 6]。""",
"schema_links":"""["歌曲名", "发布时间", "歌手名", "周杰伦", 2023, 6]""",
"sql":"""select 歌曲名 from 歌曲库 where YEAR(发布时间) >= 2023 and MONTH(发布时间) >= 6 and 歌手名 = '周杰伦' and 数据日期 = '2023-08-01' """
},
{ "current_date":"2023-08-01",
"table_name":"歌曲库",
"fields_list":"""["歌曲名", "歌曲版本", "歌手名", "歌曲类型", "发布时间", "TME歌曲ID", "是否优选窄口径歌曲", "是否优选宽口径歌曲", "是否音乐人歌曲", "网易歌曲ID", "Q音歌曲ID", "Q音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""",
"question":"邓紫棋在2023年1月5日之后发布的歌曲中有哪些播放量大于500W的",
"prior_schema_links":"""['2312311'->TME歌手ID]""",
"analysis": """让我们一步一步地思考。在问题“邓紫棋在2023年1月5日之后发布的歌曲中有哪些播放量大于500W的“中我们被问
“播放量大于500W的”所以我们需要column=[结算播放量]
”邓紫棋在2023年1月5日之后发布的“所以我们需要column=[发布时间]
”邓紫棋“所以我们需要column=[歌手名]
基于table和columns可能的cell values 是 = ['邓紫棋', 2023, 1, 5, 5000000]。""",
"schema_links":"""["结算播放量", "发布时间", "歌手名", "邓紫棋", 2023, 1, 5, 5000000]""",
"sql":"""select 歌曲名 from 歌曲库 where YEAR(发布时间) >= 2023 and MONTH(发布时间) >= 1 and DAY(发布时间) >= 5 and 歌手名 = '邓紫棋' and 结算播放量 > 5000000 and 数据日期 = '2023-08-01'"""
},
{ "current_date":"2023-09-17",
"table_name":"歌曲库",
"fields_list":"""["歌曲名", "歌曲版本", "歌手名", "歌曲类型", "发布时间", "TME歌曲ID", "是否优选窄口径歌曲", "是否优选宽口径歌曲", "是否音乐人歌曲", "网易歌曲ID", "Q音歌曲ID", "Q音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""",
"question":"2023年6月以后张靓颖播放量大于200万的歌曲有哪些",
"prior_schema_links":"""['45453'->TME歌手ID]""",
"analysis": """让我们一步一步地思考。在问题“2023年6月以后张靓颖播放量大于200万的歌曲有哪些“中我们被问
“播放量大于200万的”所以我们需要column=[结算播放量]
”2023年6月以后张靓颖“所以我们需要column=[数据日期, 歌手名]
”歌曲有哪些“所以我们需要column=[歌曲名]
基于table和columns可能的cell values 是 = ['张靓颖', 2023, 6, 2000000]。""",
"schema_links":"""["结算播放量", "数据日期", "歌手名", "张靓颖", 2023, 6, 2000000]""",
"sql":"""select 歌曲名 from 歌曲库 where YEAR(数据日期) >= 2023 and MONTH(数据日期) >= 6 and 歌手名 = '张靓颖' and 结算播放量 > 2000000 """
},
{ "current_date":"2023-08-16",
"table_name":"歌曲库",
"fields_list":"""["歌曲名", "歌曲版本", "歌手名", "歌曲类型", "发布时间", "TME歌曲ID", "是否优选窄口径歌曲", "是否优选宽口径歌曲", "是否音乐人歌曲", "网易歌曲ID", "Q音歌曲ID", "Q音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""",
"question":"2021年6月以后发布的李宇春的播放量大于20万的歌曲有哪些",
"prior_schema_links":"""['23109'->TME歌手ID]""",
"analysis": """让我们一步一步地思考。在问题“2021年6月以后发布的李宇春的播放量大于20万的歌曲有哪些“中我们被问
“播放量大于20万的”所以我们需要column=[结算播放量]
”2021年6月以后发布的“所以我们需要column=[发布时间]
”李宇春“所以我们需要column=[歌手名]
基于table和columns可能的cell values 是 = ['李宇春', 2021, 6, 200000]。""",
"schema_links":"""["结算播放量", "发布时间", "歌手名", "李宇春", 2021, 6, 200000]""",
"sql":"""select 歌曲名 from 歌曲库 where YEAR(发布时间) >= 2021 and MONTH(发布时间) >= 6 and 歌手名 = '李宇春' and 结算播放量 > 200000 and 数据日期 = '2023-08-16'"""
},
# Few-shot example: absolute date range ("1992年4月2日到2020年5月2日") on the release date.
{ "current_date":"2023-08-16",
"table_name":"歌曲库",
"fields_list":"""["歌曲名", "歌曲版本", "歌手名", "歌曲类型", "发布时间", "TME歌曲ID", "是否优选窄口径歌曲", "是否优选宽口径歌曲", "是否音乐人歌曲", "网易歌曲ID", "Q音歌曲ID", "Q音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""",
"question":"刘德华在1992年4月2日到2020年5月2日之间发布的播放量大于20万的歌曲有哪些",
"prior_schema_links":"""['4234234'->TME歌手ID]""",
"analysis": """让我们一步一步地思考。在问题“刘德华在1992年4月2日到2020年5月2日之间发布的播放量大于20万的歌曲有哪些“中我们被问
“播放量大于20万的”所以我们需要column=[结算播放量]
”1992年4月2日到2020年5月2日之间发布的“所以我们需要column=[发布时间]
”刘德华“所以我们需要column=[歌手名]
基于table和columns可能的cell values 是 = ['刘德华', 1992, 4, 2, 2020, 5, 2, 200000]。""",
"schema_links":"""["结算播放量", "发布时间", "歌手名", "刘德华", 1992, 4, 2, 2020, 5, 2, 200000]""",
# The range is expressed as two whole-date bounds. Bounding YEAR, MONTH and DAY
# independently would only match dates whose day is exactly 2 and whose month
# is April or May, which is not the interval the question asks for.
# NOTE(review): assumes the downstream SQL dialect compares 发布时间 with
# 'YYYY-MM-DD' literals the same way it already does for 数据日期 — confirm.
"sql":"""select 歌曲名 from 歌曲库 where 发布时间 >= '1992-04-02' and 发布时间 <= '2020-05-02' and 歌手名 = '刘德华' and 结算播放量 > 200000 and 数据日期 = '2023-08-16'"""
}
]

View File

@@ -8,8 +8,7 @@ from typing import Any, List, Mapping, Optional, Union
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
import chromadb
from chromadb.config import Settings
from chromadb.api import Collection, Documents, Embeddings
from langchain.llms import OpenAI
@@ -21,13 +20,9 @@ from preset_query_db import (get_ids, add2preset_query_collection,
from util.text2vec import Text2VecEmbeddingFunction
from run_config import CHROMA_DB_PERSIST_PATH, PRESET_QUERY_COLLECTION_NAME
from util.chromadb_instance import client
client = chromadb.Client(Settings(
chroma_db_impl="duckdb+parquet",
persist_directory=CHROMA_DB_PERSIST_PATH # Optional, defaults to .chromadb/ in the current directory
))
emb_func = Text2VecEmbeddingFunction()
collection = client.get_or_create_collection(name=PRESET_QUERY_COLLECTION_NAME,
@@ -35,6 +30,8 @@ collection = client.get_or_create_collection(name=PRESET_QUERY_COLLECTION_NAME,
metadata={"hnsw:space": "cosine"}
) # Get a collection object from an existing collection, by name. If it doesn't exist, create it.
print("init_preset_query_collection_size: ", preset_query_collection_size(collection))
def preset_query_retrieval_run(collection:Collection, query_texts_list:List[str], n_results:int=5):
retrieval_res = query2preset_query_collection(collection=collection,

View File

@@ -9,6 +9,7 @@ TEMPERATURE = 0.0
# Directory name, joined onto PROJECT_DIR_PATH below, used for Chroma persistence.
CHROMA_DB_PERSIST_DIR = 'chm_db'
# Names of the Chroma collections used by the preset-query and text2dsl features.
PRESET_QUERY_COLLECTION_NAME = "preset_query_collection"
TEXT2DSL_COLLECTION_NAME = "text2dsl_collection"
# Persistence path: PROJECT_DIR_PATH joined with the directory name above.
CHROMA_DB_PERSIST_PATH = os.path.join(PROJECT_DIR_PATH, CHROMA_DB_PERSIST_DIR)

View File

@@ -0,0 +1,53 @@
# -*- coding:utf-8 -*-
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from langchain.prompts.few_shot import FewShotPromptTemplate
from langchain.prompts import PromptTemplate
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings
from langchain.prompts.example_selector import SemanticSimilarityExampleSelector
import chromadb
from chromadb.config import Settings
from few_shot_example.sql_exampler import examplars as din_sql_examplars
from util.text2vec import Text2VecEmbeddingFunction, hg_embedding
from util.chromadb_instance import client as chromadb_client
from run_config import TEXT2DSL_COLLECTION_NAME
# Vector store shared by both example selectors below; it lives in the
# TEXT2DSL_COLLECTION_NAME collection of the shared chromadb client.
vectorstore = Chroma(collection_name=TEXT2DSL_COLLECTION_NAME,
                     embedding_function=hg_embedding,
                     client=chromadb_client)

# Number of few-shot examples each selector retrieves per question (k).
example_nums = 15

# Selector for the schema-linking prompt: examples are looked up by "question".
schema_linking_example_selector = SemanticSimilarityExampleSelector(
    vectorstore=vectorstore, k=example_nums,
    input_keys=["question"],
    example_keys=["table_name", "fields_list", "prior_schema_links", "question", "analysis", "schema_links"])

# Selector for the SQL-generation prompt: same store, different example keys.
sql_example_selector = SemanticSimilarityExampleSelector(
    vectorstore=vectorstore, k=example_nums,
    input_keys=["question"],
    example_keys=["question", "current_date", "table_name", "schema_links", "sql"])


def _load_din_sql_examplars():
    """Add every bundled few-shot example to the store and print its new size."""
    for example in din_sql_examplars:
        schema_linking_example_selector.add_example(example)
    print("added din_sql_vectorstore size:", vectorstore._collection.count())


if vectorstore._collection.count() > 0:
    print("examples already in din_sql_vectorstore")
    print("init din_sql_vectorstore size:", vectorstore._collection.count())
    # Fewer stored entries than bundled examples: clear the collection and
    # rebuild it from the bundled list.
    if vectorstore._collection.count() < len(din_sql_examplars):
        print("din_sql_examplars size:", len(din_sql_examplars))
        # NOTE(review): relies on Collection.delete() with no arguments
        # removing every entry — confirm against the pinned chromadb version.
        vectorstore._collection.delete()
        print("empty din_sql_vectorstore")
        _load_din_sql_examplars()
else:
    # Empty store (first start): index all bundled examples.
    _load_din_sql_examplars()

View File

@@ -1,15 +1,13 @@
# -*- coding:utf-8 -*-
import re
def schema_link_parse(schema_link_output):
    """Extract the text that follows the ``Schema_links:`` marker.

    Args:
        schema_link_output: raw text produced for the schema-linking prompt.

    Returns:
        Everything after the first ``Schema_links:`` marker up to the end of
        the text, with surrounding whitespace removed; ``None`` when the input
        is not a string or the marker is absent.
    """
    if not isinstance(schema_link_output, str):
        print("schema_link_parse: expected str, got {}".format(
            type(schema_link_output).__name__))
        return None

    # DOTALL lets '.' cross newlines, so a multi-line answer is captured whole.
    match = re.search(r'Schema_links:(.*)', schema_link_output.strip(), re.DOTALL)
    if match is None:
        print("schema_link_parse: 'Schema_links:' marker not found")
        return None

    return match.group(1).strip()

View File

@@ -1,8 +1,5 @@
# -*- coding:utf-8 -*-
from typing import Any, List, Mapping, Optional, Union
import requests
import logging
import json
import os
import sys
@@ -11,78 +8,68 @@ sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from langchain.prompts import PromptTemplate
from langchain.prompts.few_shot import FewShotPromptTemplate
from langchain.llms import OpenAI
from few_shot_example.sql_exampler import examplars
from output_parser import schema_link_parse
def schema_linking_prompt_maker(user_query: str, model_name: str,
                                fields_list: List[str],
                                few_shots_example: str):
    """Build a schema-linking prompt from already-rendered few-shot text.

    Args:
        user_query: the question to analyse.
        model_name: inserted as the table name.
        fields_list: inserted as the column list (formatted with ``str()``).
        few_shots_example: pre-rendered examples, inserted verbatim between
            the instruction line and the final question section.

    Returns:
        The instruction, the few-shot text and the question section
        concatenated in that order.
    """
    instruction = "# 根据数据库的表结构,找出为每个问题生成SQL查询语句的schema_links\n"
    schema_linking_prompt = "Table {table_name}, columns = {fields_list}\n问题:{user_query}\n分析: 让我们一步一步地思考。".format(
        table_name=model_name,
        fields_list=fields_list,
        user_query=user_query)
    return instruction + few_shots_example + schema_linking_prompt
from langchain.prompts.example_selector import SemanticSimilarityExampleSelector
def schema_linking_exampler(user_query: str,
                            domain_name: str,
                            fields_list: List[str],
                            prior_schema_links: Mapping[str,str],
                            example_selector: SemanticSimilarityExampleSelector,
                            ) -> str:
    """Render the few-shot schema-linking prompt for one question.

    Args:
        user_query: the question to analyse.
        domain_name: inserted as the table name.
        fields_list: inserted as the column list.
        prior_schema_links: mapping rendered as ``['key'->value,...]`` and
            inserted as the prior schema links of the question.
        example_selector: supplies the few-shot examples for the prompt.

    Returns:
        The formatted prompt: instruction, selected examples, then the
        question section ending with the step-by-step analysis cue.
    """
    # Render the mapping in the same 'key'->value notation the examples use.
    prior_schema_links_str = '['+ ','.join("""'{}'->{}""".format(k,v) for k,v in prior_schema_links.items()) + ']'

    # Layout of a single solved example.
    example_prompt_template = PromptTemplate(
        input_variables=["table_name", "fields_list", "prior_schema_links", "question", "analysis", "schema_links"],
        template="Table {table_name}, columns = {fields_list}, prior_schema_links = {prior_schema_links}\n问题:{question}\n分析:{analysis} 所以Schema_links是:\nSchema_links:{schema_links}")

    instruction = "# 根据数据库的表结构,参考先验信息,找出为每个问题生成SQL查询语句的schema_links"

    # Suffix: the unsolved question, left open after the analysis cue.
    schema_linking_prompt = "Table {table_name}, columns = {fields_list}, prior_schema_links = {prior_schema_links}\n问题:{question}\n分析: 让我们一步一步地思考。"

    schema_linking_example_prompt_template = FewShotPromptTemplate(
        example_selector=example_selector,
        example_prompt=example_prompt_template,
        example_separator="\n\n",
        prefix=instruction,
        input_variables=["table_name", "fields_list", "prior_schema_links", "question"],
        suffix=schema_linking_prompt
    )

    schema_linking_example_prompt = schema_linking_example_prompt_template.format(
        table_name=domain_name,
        fields_list=fields_list,
        prior_schema_links=prior_schema_links_str,
        question=user_query)

    return schema_linking_example_prompt
def sql_exampler(user_query: str,
                 domain_name: str,
                 schema_link_str: str,
                 data_date: str,
                 example_selector: SemanticSimilarityExampleSelector,
                 ) -> str:
    """Render the few-shot SQL-generation prompt for one question.

    Args:
        user_query: the question to translate.
        domain_name: inserted as the table name.
        schema_link_str: schema links text inserted after ``Schema_links:``.
        data_date: inserted after ``Current_date:``.
        example_selector: supplies the few-shot examples for the prompt.

    Returns:
        The formatted prompt: instruction, selected examples, then the
        question section ending with an open ``SQL:`` line.
    """
    instruction = "# 根据schema_links为每个问题生成SQL查询语句"

    # Layout of a single solved example (includes its SQL answer).
    example_prompt = PromptTemplate(
        input_variables=["question", "current_date", "table_name", "schema_links", "sql"],
        template="问题:{question}\nCurrent_date:{current_date}\nTable {table_name}\nSchema_links:{schema_links}\nSQL:{sql}")

    # Suffix: same layout as an example, with the SQL left for the model.
    sql_prompt = "问题:{question}\nCurrent_date:{current_date}\nTable {table_name}\nSchema_links:{schema_links}\nSQL:"

    few_shot_prompt = FewShotPromptTemplate(
        example_selector=example_selector,
        example_prompt=example_prompt,
        example_separator="\n\n",
        prefix=instruction,
        input_variables=["question", "current_date", "table_name", "schema_links"],
        suffix=sql_prompt
    )

    sql_example_prompt = few_shot_prompt.format(question=user_query,
                                                current_date=data_date,
                                                table_name=domain_name,
                                                schema_links=schema_link_str)

    return sql_example_prompt

View File

@@ -1,6 +1,4 @@
# -*- coding:utf-8 -*-
from typing import List, Union
from typing import List, Union, Mapping
import logging
import json
import os
@@ -9,33 +7,54 @@ import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from sql.prompt_maker import schema_linking_exampler, schema_link_parse, \
sql_exampler
from sql.prompt_maker import schema_linking_exampler, sql_exampler
from sql.constructor import schema_linking_example_selector, sql_example_selector
from sql.output_parser import schema_link_parse
from util.llm_instance import llm
def query2sql(query_text: str,
              schema : Union[dict, None] = None,
              current_date: str = None,
              linking: Union[List[Mapping[str, str]], None] = None
              ):
    """Run the two-step text-to-SQL pipeline: schema linking, then SQL generation.

    Args:
        query_text: the user's question.
        schema: must contain ``modelName`` and ``fieldNameList``. Although the
            default is ``None``, it is subscripted unconditionally, so callers
            have to supply it.
        current_date: passed to the SQL prompt as the current date.
        linking: optional list of items, each read via its ``fieldValue`` and
            ``fieldName`` keys; ``None`` means no prior schema links.

    Returns:
        dict with keys ``query``, ``model``, ``fields``, ``priorSchemaLinking``,
        ``dataDate``, ``schemaLinkingOutput``, ``schemaLinkStr``, ``sqlOutput``.
    """
    print("query_text: ", query_text)
    print("schema: ", schema)
    print("current_date: ", current_date)
    print("prior_schema_links: ", linking)

    # Flatten the linking items into {field value: field name}.
    if linking is not None:
        prior_schema_links = {item['fieldValue']:item['fieldName'] for item in linking}
    else:
        prior_schema_links = {}

    model_name = schema['modelName']
    fields_list = schema['fieldNameList']

    # Step 1: ask the LLM for schema links, then keep only the parsed part.
    schema_linking_prompt = schema_linking_exampler(query_text, model_name, fields_list, prior_schema_links, schema_linking_example_selector)
    print("schema_linking_prompt->", schema_linking_prompt)
    schema_link_output = llm(schema_linking_prompt)
    schema_link_str = schema_link_parse(schema_link_output)

    # Step 2: ask the LLM for the SQL, given the parsed schema links.
    sql_prompt = sql_exampler(query_text, model_name, schema_link_str, current_date, sql_example_selector)
    print("sql_prompt->", sql_prompt)
    sql_output = llm(sql_prompt)

    resp = dict()
    resp['query'] = query_text
    resp['model'] = model_name
    resp['fields'] = fields_list
    resp['priorSchemaLinking'] = linking
    resp['dataDate'] = current_date

    resp['schemaLinkingOutput'] = schema_link_output
    resp['schemaLinkStr'] = schema_link_str

    resp['sqlOutput'] = sql_output

    print("resp: ", resp)

    return resp

View File

@@ -0,0 +1,10 @@
# -*- coding:utf-8 -*-
import chromadb
from chromadb.config import Settings
from run_config import CHROMA_DB_PERSIST_PATH
# Module-level Chroma client, created once at import time with the
# "duckdb+parquet" implementation and persisting under CHROMA_DB_PERSIST_PATH.
client = chromadb.Client(Settings(
chroma_db_impl="duckdb+parquet",
persist_directory=CHROMA_DB_PERSIST_PATH # Optional, defaults to .chromadb/ in the current directory
))