diff --git a/assembly/bin/build-chat.sh b/assembly/bin/build-chat.sh
index e00ab2fa9..6305f9d8a 100755
--- a/assembly/bin/build-chat.sh
+++ b/assembly/bin/build-chat.sh
@@ -1,7 +1,7 @@
#!/usr/bin/env bash
sbinDir=$(cd "$(dirname "$0")"; pwd)
-baseDir=$(readlink -f $sbinDir/../)
+baseDir=$(cd "$sbinDir/.." && pwd -P)
runtimeDir=$baseDir/runtime
buildDir=$baseDir/build
diff --git a/assembly/bin/build-ide.sh b/assembly/bin/build-ide.sh
index b2bcd0ca3..87bfda75e 100755
--- a/assembly/bin/build-ide.sh
+++ b/assembly/bin/build-ide.sh
@@ -1,7 +1,7 @@
#!/usr/bin/env bash
sbinDir=$(cd "$(dirname "$0")"; pwd)
-baseDir=$(readlink -f $sbinDir/../)
+baseDir=$(cd "$sbinDir/.." && pwd -P)
buildDir=$baseDir/build
cd $baseDir/bin
diff --git a/assembly/bin/build-semantic.sh b/assembly/bin/build-semantic.sh
index 5eefb3845..1c969081b 100755
--- a/assembly/bin/build-semantic.sh
+++ b/assembly/bin/build-semantic.sh
@@ -1,7 +1,7 @@
#!/usr/bin/env bash
sbinDir=$(cd "$(dirname "$0")"; pwd)
-baseDir=$(readlink -f $sbinDir/../)
+baseDir=$(cd "$sbinDir/.." && pwd -P)
runtimeDir=$baseDir/runtime
buildDir=$baseDir/build
diff --git a/assembly/bin/build-standalone.sh b/assembly/bin/build-standalone.sh
index efa2324e8..b14435eca 100755
--- a/assembly/bin/build-standalone.sh
+++ b/assembly/bin/build-standalone.sh
@@ -1,7 +1,7 @@
#!/usr/bin/env bash
sbinDir=$(cd "$(dirname "$0")"; pwd)
-baseDir=$(readlink -f $sbinDir/../)
+baseDir=$(cd "$sbinDir/.." && pwd -P)
runtimeDir=$baseDir/runtime
buildDir=$baseDir/build
diff --git a/assembly/bin/start-chat.sh b/assembly/bin/start-chat.sh
index 87e1920b4..0ee1435fe 100755
--- a/assembly/bin/start-chat.sh
+++ b/assembly/bin/start-chat.sh
@@ -1,7 +1,7 @@
#!/usr/bin/env bash
sbinDir=$(cd "$(dirname "$0")"; pwd)
-baseDir=$(readlink -f $sbinDir/../)
+baseDir=$(cd "$sbinDir/.." && pwd -P)
runtimeDir=$baseDir/../runtime
buildDir=$baseDir/build
diff --git a/assembly/bin/start-semantic.sh b/assembly/bin/start-semantic.sh
index 1c070b83b..0175de675 100755
--- a/assembly/bin/start-semantic.sh
+++ b/assembly/bin/start-semantic.sh
@@ -1,7 +1,7 @@
#!/usr/bin/env bash
sbinDir=$(cd "$(dirname "$0")"; pwd)
-baseDir=$(readlink -f $sbinDir/../)
+baseDir=$(cd "$sbinDir/.." && pwd -P)
runtimeDir=$baseDir/../runtime
buildDir=$baseDir/build
diff --git a/assembly/bin/start-standalone.sh b/assembly/bin/start-standalone.sh
index 80fb3099d..3cb9aff0f 100755
--- a/assembly/bin/start-standalone.sh
+++ b/assembly/bin/start-standalone.sh
@@ -1,7 +1,7 @@
#!/usr/bin/env bash
sbinDir=$(cd "$(dirname "$0")"; pwd)
-baseDir=$(readlink -f $sbinDir/../)
+baseDir=$(cd "$sbinDir/.." && pwd -P)
runtimeDir=$baseDir/../runtime
buildDir=$baseDir/build
@@ -29,4 +29,4 @@ rm -fr ${buildDir}/supersonic-webapp
#start standalone service
sh ${runtimeDir}/supersonic-standalone/bin/service.sh restart
#start llm service
-sh ${runtimeDir}/supersonic-standalone/llm/bin/service.sh restart
\ No newline at end of file
+sh ${runtimeDir}/supersonic-standalone/llm/bin/service.sh restart
diff --git a/auth/api/src/main/java/com/tencent/supersonic/auth/api/authentication/config/AuthenticationConfig.java b/auth/api/src/main/java/com/tencent/supersonic/auth/api/authentication/config/AuthenticationConfig.java
index 78c7cf080..05d1a8991 100644
--- a/auth/api/src/main/java/com/tencent/supersonic/auth/api/authentication/config/AuthenticationConfig.java
+++ b/auth/api/src/main/java/com/tencent/supersonic/auth/api/authentication/config/AuthenticationConfig.java
@@ -19,7 +19,7 @@ public class AuthenticationConfig {
@Value("${authentication.token.secret:secret}")
private String tokenSecret;
- @Value("${authentication.token.http.header.key:Auth}")
+ @Value("${authentication.token.http.header.key:Authorization}")
private String tokenHttpHeaderKey;
diff --git a/chat/core/src/main/python/bin/install.sh b/chat/core/src/main/python/bin/install.sh
index df8800a19..9e68a6fd2 100755
--- a/chat/core/src/main/python/bin/install.sh
+++ b/chat/core/src/main/python/bin/install.sh
@@ -1,7 +1,7 @@
#!/usr/bin/env bash
binDir=$(cd "$(dirname "$0")"; pwd)
-baseDir=$(readlink -f $binDir/../)
+baseDir=$(cd "$binDir/.." && pwd -P)
echo $binDir
source ${binDir}/env.sh
diff --git a/chat/core/src/main/python/bin/run.sh b/chat/core/src/main/python/bin/run.sh
index 8ed1a9d90..8f7af2f23 100755
--- a/chat/core/src/main/python/bin/run.sh
+++ b/chat/core/src/main/python/bin/run.sh
@@ -4,7 +4,7 @@ llm_host=$1
llm_port=$2
binDir=$(cd "$(dirname "$0")"; pwd)
-baseDir=$(readlink -f $binDir/../)
+baseDir=$(cd "$binDir/.." && pwd -P)
source ${baseDir}/bin/env.sh
diff --git a/chat/core/src/main/python/bin/service.sh b/chat/core/src/main/python/bin/service.sh
index bd3ae8e0a..a1788b5fb 100755
--- a/chat/core/src/main/python/bin/service.sh
+++ b/chat/core/src/main/python/bin/service.sh
@@ -1,7 +1,7 @@
#!/usr/bin/env bash
binDir=$(cd "$(dirname "$0")"; pwd)
-baseDir=$(readlink -f $binDir/../)
+baseDir=$(cd "$binDir/.." && pwd -P)
source ${baseDir}/bin/env.sh
command=$1
diff --git a/chat/core/src/main/python/llm/api_service.py b/chat/core/src/main/python/llm/api_service.py
index a5bd975f2..c06748b74 100644
--- a/chat/core/src/main/python/llm/api_service.py
+++ b/chat/core/src/main/python/llm/api_service.py
@@ -25,20 +25,31 @@ app = FastAPI()
@app.post("/query2sql/")
async def din_query2sql(query_body: Mapping[str, Any]):
- if 'queryText' not in query_body:
- raise HTTPException(status_code=400,
+ if 'queryText' not in query_body:
+ raise HTTPException(status_code=400,
detail="query_text is not in query_body")
- else:
- query_text = query_body['queryText']
+ else:
+ query_text = query_body['queryText']
- if 'schema' not in query_body:
- raise HTTPException(status_code=400, detail="schema is not in query_body")
- else:
- schema = query_body['schema']
+ if 'schema' not in query_body:
+ raise HTTPException(status_code=400, detail="schema is not in query_body")
+ else:
+ schema = query_body['schema']
- resp = query2sql(query_text=query_text, schema=schema)
+ if 'currentDate' not in query_body:
+ raise HTTPException(status_code=400, detail="currentDate is not in query_body")
+ else:
+ current_date = query_body['currentDate']
- return resp
+ if 'linking' not in query_body:
+ linking = None
+ else:
+ linking = query_body['linking']
+
+ resp = query2sql(query_text=query_text,
+ schema=schema, current_date=current_date, linking=linking)
+
+ return resp
@app.post("/preset_query_retrival/")
diff --git a/chat/core/src/main/python/llm/few_shot_example/sql_exampler.py b/chat/core/src/main/python/llm/few_shot_example/sql_exampler.py
new file mode 100644
index 000000000..87a32dc3d
--- /dev/null
+++ b/chat/core/src/main/python/llm/few_shot_example/sql_exampler.py
@@ -0,0 +1,296 @@
+examplars= [
+ { "current_date":"2020-12-01",
+ "table_name":"内容库产品",
+ "fields_list":"""["部门", "模块", "用户名", "访问次数", "访问人数", "访问时长", "数据日期"]""",
+ "question":"比较jackjchen和robinlee在内容库的访问次数",
+ "prior_schema_links":"""['jackjchen'->用户名, 'robinlee'->用户名]""",
+ "analysis": """让我们一步一步地思考。在问题“比较jackjchen和robinlee在内容库的访问次数“中,我们被问:
+“比较jackjchen和robinlee”,所以我们需要column=[用户名]
+”内容库的访问次数“,所以我们需要column=[访问次数]
+基于table和columns,可能的cell values 是 = ['jackjchen', 'robinlee']。""",
+ "schema_links":"""["用户名", "访问次数", "'jackjchen'", "'robinlee'"]""",
+ "sql":"""select 用户名, 访问次数 from 内容库产品 where 用户名 in ('jackjchen', 'robinlee') and 数据日期 = '2020-12-01' """
+ },
+ { "current_date":"2022-11-06",
+ "table_name":"内容库产品",
+ "fields_list":"""["部门", "模块", "用户名", "访问次数", "访问人数", "访问时长", "数据日期"]""",
+ "question":"内容库近12个月访问人数 按部门",
+ "prior_schema_links":"""[]""",
+ "analysis": """让我们一步一步地思考。在问题“内容库近12个月访问人数 按部门“中,我们被问:
+”内容库近12个月“,所以我们需要column=[数据日期]
+“访问人数”,所以我们需要column=[访问人数]
+”按部门“,所以我们需要column=[部门]
+基于table和columns,可能的cell values 是 = [12]。""",
+ "schema_links":"""["访问人数", "部门", "数据日期", 12]""",
+ "sql":"""select 部门, 数据日期, 访问人数 from 内容库产品 where datediff('month', 数据日期, '2022-11-06') <= 12 """
+ },
+ { "current_date":"2023-04-21",
+ "table_name":"内容库产品",
+ "fields_list":"""["部门", "模块", "用户名", "访问次数", "访问人数", "访问时长", "数据日期"]""",
+ "question":"内容库美术部、技术研发部的访问时长",
+ "prior_schema_links":"""['美术部'->部门, '技术研发部'->部门]""",
+ "analysis": """让我们一步一步地思考。在问题“内容库美术部、技术研发部的访问时长“中,我们被问:
+“访问时长”,所以我们需要column=[访问时长]
+”内容库美术部、技术研发部“,所以我们需要column=[部门]
+基于table和columns,可能的cell values 是 = ['美术部', '技术研发部']。""",
+ "schema_links":"""["访问时长", "部门", "'美术部'", "'技术研发部'"]""",
+ "sql":"""select 部门, 访问时长 from 内容库产品 where 部门 in ('美术部', '技术研发部') and 数据日期 = '2023-04-21' """
+ },
+ { "current_date":"2023-08-21",
+ "table_name":"严选",
+ "fields_list":"""["严选版权归属系", "付费模式", "结算播放份额", "付费用户结算播放份额", "数据日期"]""",
+ "question":"近3天海田飞系MPPM结算播放份额",
+ "prior_schema_links":"""['海田飞系'->严选版权归属系]""",
+ "analysis": """让我们一步一步地思考。在问题“近3天海田飞系MPPM结算播放份额“中,我们被问:
+“MPPM结算播放份额”,所以我们需要column=[结算播放份额]
+”海田飞系“,所以我们需要column=[严选版权归属系]
+”近3天“,所以我们需要column=[数据日期]
+基于table和columns,可能的cell values 是 = ['海田飞系', 3]。""",
+ "schema_links":"""["结算播放份额", "严选版权归属系", "数据日期", "'海田飞系'", 3]""",
+ "sql":"""select 严选版权归属系, 结算播放份额 from 严选 where 严选版权归属系 = '海田飞系' and datediff('day', 数据日期, '2023-08-21') <= 3 """
+ },
+ { "current_date":"2023-05-22",
+ "table_name":"歌曲库",
+ "fields_list":"""["是否潮流人歌曲", "C音歌曲ID", "C音歌曲MID", "歌曲名", "歌曲版本", "语种", "歌曲类型", "翻唱类型", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "结算播放量", "运营播放量", "付费用户结算播放量", "历史累计结算播放量", "运营搜播量", "结算搜播量", "运营完播量", "运营推播量", "近7日复播率", "日均搜播量", "数据日期"]""",
+ "question":"对比近7天翻唱版和纯音乐的歌曲播放量",
+ "prior_schema_links":"""['纯音乐'->语种, '翻唱版'->歌曲版本]""",
+ "analysis": """让我们一步一步地思考。在问题“对比近7天翻唱版和纯音乐的歌曲播放量“中,我们被问:
+“歌曲播放量”,所以我们需要column=[结算播放量]
+”翻唱版“,所以我们需要column=[歌曲版本]
+”和纯音乐的歌曲“,所以我们需要column=[语种]
+”近7天“,所以我们需要column=[数据日期]
+基于table和columns,可能的cell values 是 = ['翻唱版', '纯音乐', 7]。""",
+ "schema_links":"""["结算播放量", "歌曲版本", "语种", "数据日期", "'翻唱版'", "'纯音乐'", 7]""",
+ "sql":"""select 歌曲版本, 语种, 结算播放量 from 歌曲库 where 歌曲版本 = '翻唱版' and 语种 = '纯音乐' and datediff('day', 数据日期, '2023-05-22') <= 7 """
+ },
+ { "current_date":"2023-05-31",
+ "table_name":"艺人库",
+ "fields_list":"""["上下架状态", "歌手名", "歌手等级", "歌手类型", "歌手来源", "MPPM潮流人等级", "活跃区域", "年龄", "歌手才能", "歌手风格", "粉丝数", "潮音粉丝数", "超声波粉丝数", "推博粉丝数", "超声波歌曲数", "在架歌曲数", "超声波分享数", "独占歌曲数", "超声波在架歌曲评论数", "有播放量歌曲数", "数据日期"]""",
+ "question":"对比一下陈拙悬、孟梅琦、赖媚韵的粉丝数",
+ "prior_schema_links":"""['1527896'->MPPM歌手ID, '1565463'->MPPM歌手ID, '2141459'->MPPM歌手ID]""",
+ "analysis": """让我们一步一步地思考。在问题“对比一下陈拙悬、孟梅琦、赖媚韵的粉丝数“中,我们被问:
+“粉丝数”,所以我们需要column=[粉丝数]
+”陈拙悬、孟梅琦、赖媚韵“,所以我们需要column=[歌手名]
+基于table和columns,可能的cell values 是 = ['陈拙悬', '孟梅琦', '赖媚韵']。""",
+ "schema_links":"""["粉丝数", "歌手名", "'陈拙悬'", "'孟梅琦'", "'赖媚韵'"]""",
+ "sql":"""select 歌手名, 粉丝数 from 艺人库 where 歌手名 in ('陈拙悬', '孟梅琦', '赖媚韵') and 数据日期 = '2023-05-31' """
+ },
+ { "current_date":"2023-07-31",
+ "table_name":"歌曲库",
+ "fields_list":"""["歌曲名", "歌曲版本", "歌曲类型", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""",
+ "question":"播放量大于1万的歌曲有多少",
+ "prior_schema_links":"""[]""",
+ "analysis": """让我们一步一步地思考。在问题“播放量大于1万的歌曲有多少“中,我们被问:
+“歌曲有多少”,所以我们需要column=[歌曲名]
+”播放量大于1万的“,所以我们需要column=[结算播放量]
+基于table和columns,可能的cell values 是 = [10000]。""",
+ "schema_links":"""["歌曲名", "结算播放量", 10000]""",
+ "sql":"""select 歌曲名 from 歌曲库 where 结算播放量 > 10000 and 数据日期 = '2023-07-31' """
+ },
+ { "current_date":"2023-07-31",
+ "table_name":"内容库产品",
+ "fields_list":"""["用户名", "部门", "模块", "访问时长", "访问次数", "访问人数", "数据日期"]""",
+ "question":"内容库访问时长小于1小时,且来自美术部的用户是哪些",
+ "prior_schema_links":"""['美术部'->部门]""",
+ "analysis": """让我们一步一步地思考。在问题“内容库访问时长小于1小时,且来自美术部的用户是哪些“中,我们被问:
+“用户是哪些”,所以我们需要column=[用户名]
+”美术部的“,所以我们需要column=[部门]
+”访问时长小于1小时“,所以我们需要column=[访问时长]
+基于table和columns,可能的cell values 是 = ['美术部', 1]。""",
+ "schema_links":"""["用户名", "部门", "访问时长", "'美术部'", 1]""",
+ "sql":"""select 用户名 from 内容库产品 where 部门 = '美术部' and 访问时长 < 1 and 数据日期 = '2023-07-31' """
+ },
+ { "current_date":"2023-08-31",
+ "table_name":"内容库产品",
+ "fields_list":"""["用户名", "部门", "模块", "访问时长", "访问次数", "访问人数", "数据日期"]""",
+ "question":"内容库pv最高的用户有哪些",
+ "prior_schema_links":"""[]""",
+ "analysis": """让我们一步一步地思考。在问题“内容库pv最高的用户有哪些“中,我们被问:
+“用户有哪些”,所以我们需要column=[用户名]
+”pv最高的“,所以我们需要column=[访问次数]
+基于table和columns,可能的cell values 是 = []。""",
+ "schema_links":"""["用户名", "访问次数"]""",
+ "sql":"""select 用户名 from 内容库产品 where 数据日期 = '2023-08-31' order by 访问次数 desc limit 10 """
+ },
+ { "current_date":"2023-08-31",
+ "table_name":"艺人库",
+ "fields_list":"""["播放量层级", "播放量单调性", "播放量方差", "播放量突增类型", "播放量集中度", "歌手名", "歌手等级", "歌手类型", "歌手来源", "MPPM潮流人等级", "结算播放量", "运营播放量", "历史累计结算播放量", "有播放量歌曲数", "历史累计运营播放量", "付费用户结算播放量", "结算播放量占比", "运营播放份额", "免费用户结算播放占比", "完播量", "数据日期"]""",
+ "question":"近90天袁亚伟播放量平均值是多少",
+ "prior_schema_links":"""['152789226'->MPPM歌手ID]""",
+ "analysis": """让我们一步一步地思考。在问题“近90天袁亚伟播放量平均值是多少“中,我们被问:
+“播放量平均值是多少”,所以我们需要column=[结算播放量]
+”袁亚伟“,所以我们需要column=[歌手名]
+”近90天“,所以我们需要column=[数据日期]
+基于table和columns,可能的cell values 是 = ['袁亚伟', 90]。""",
+ "schema_links":"""["结算播放量", "歌手名", "数据日期", "'袁亚伟'", 90]""",
+ "sql":"""select avg(结算播放量) from 艺人库 where 歌手名 = '袁亚伟' and datediff('day', 数据日期, '2023-08-31') <= 90 """
+ },
+ { "current_date":"2023-08-31",
+ "table_name":"艺人库",
+ "fields_list":"""["播放量层级", "播放量单调性", "播放量方差", "播放量突增类型", "播放量集中度", "歌手名", "歌手等级", "歌手类型", "歌手来源", "MPPM潮流人等级", "结算播放量", "运营播放量", "历史累计结算播放量", "有播放量歌曲数", "历史累计运营播放量", "付费用户结算播放量", "结算播放量占比", "运营播放份额", "免费用户结算播放占比", "完播量", "数据日期"]""",
+ "question":"周倩倩近7天结算播放量总和是多少",
+ "prior_schema_links":"""['199509'->MPPM歌手ID]""",
+ "analysis": """让我们一步一步地思考。在问题“周倩倩近7天结算播放量总和是多少“中,我们被问:
+“结算播放量总和是多少”,所以我们需要column=[结算播放量]
+”周倩倩“,所以我们需要column=[歌手名]
+”近7天“,所以我们需要column=[数据日期]
+基于table和columns,可能的cell values 是 = ['周倩倩', 7]。""",
+ "schema_links":"""["结算播放量", "歌手名", "数据日期", "'周倩倩'", 7]""",
+ "sql":"""select sum(结算播放量) from 艺人库 where 歌手名 = '周倩倩' and datediff('day', 数据日期, '2023-08-31') <= 7 """
+ },
+ { "current_date":"2023-09-14",
+ "table_name":"内容库产品",
+ "fields_list":"""["部门", "模块", "用户名", "访问次数", "访问人数", "访问时长", "数据日期"]""",
+ "question":"内容库访问次数大于1k的部门是哪些",
+ "prior_schema_links":"""[]""",
+ "analysis": """让我们一步一步地思考。在问题“内容库访问次数大于1k的部门是哪些“中,我们被问:
+“部门是哪些”,所以我们需要column=[部门]
+”访问次数大于1k的“,所以我们需要column=[访问次数]
+基于table和columns,可能的cell values 是 = [1000]。""",
+ "schema_links":"""["部门", "访问次数", 1000]""",
+ "sql":"""select 部门 from 内容库产品 where 访问次数 > 1000 and 数据日期 = '2023-09-14' """
+ },
+ { "current_date":"2023-09-18",
+ "table_name":"歌曲库",
+ "fields_list":"""["歌曲名", "MPPM歌手ID", "歌曲版本", "歌曲类型", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""",
+ "question":"陈亿训唱的所有的播放量大于20k的孤勇者有哪些",
+ "prior_schema_links":"""['199509'->MPPM歌手ID, '1527123'->MPPM歌曲ID]""",
+ "analysis": """让我们一步一步地思考。在问题“陈亿训唱的所有的播放量大于20k的孤勇者有哪些“中,我们被问:
+“孤勇者有哪些”,所以我们需要column=[歌曲名]
+”播放量大于20k的“,所以我们需要column=[结算播放量]
+”陈亿训唱的“,所以我们需要column=[歌手名]
+基于table和columns,可能的cell values 是 = [20000, '陈亿训', '孤勇者']。""",
+ "schema_links":"""["歌曲名", "结算播放量", "歌手名", 20000, "'陈亿训'", "'孤勇者'"]""",
+ "sql":"""select 歌曲名 from 歌曲库 where 结算播放量 > 20000 and 歌手名 = '陈亿训' and 歌曲名 = '孤勇者' and 数据日期 = '2023-09-18' """
+ },
+ { "current_date":"2023-09-18",
+ "table_name":"歌曲库",
+ "fields_list":"""["歌曲名", "歌曲版本", "歌手名", "歌曲类型", "发布时间", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""",
+ "question":"周洁轮去年发布的歌曲有哪些",
+ "prior_schema_links":"""['23109'->MPPM歌手ID]""",
+ "analysis": """让我们一步一步地思考。在问题“周洁轮去年发布的歌曲有哪些“中,我们被问:
+“歌曲有哪些”,所以我们需要column=[歌曲名]
+”去年发布的“,所以我们需要column=[发布时间]
+”周洁轮“,所以我们需要column=[歌手名]
+基于table和columns,可能的cell values 是 = ['周洁轮', 1]。""",
+ "schema_links":"""["歌曲名", "发布时间", "歌手名", 1, "'周洁轮'"]""",
+ "sql":"""select 歌曲名 from 歌曲库 where datediff('year', 发布时间, '2023-09-18') <= 1 and 歌手名 = '周洁轮' and 数据日期 = '2023-09-18' """
+ },
+ { "current_date":"2023-09-11",
+ "table_name":"艺人库",
+ "fields_list":"""["播放量层级", "播放量单调性", "播放量方差", "播放量突增类型", "播放量集中度", "歌手名", "歌手等级", "歌手类型", "歌手来源", "签约日期", "MPPM潮流人等级", "结算播放量", "运营播放量", "历史累计结算播放量", "有播放量歌曲数", "历史累计运营播放量", "付费用户结算播放量", "结算播放量占比", "运营播放份额", "免费用户结算播放占比", "完播量", "数据日期"]""",
+ "question":"我想要近半年签约的播放量前十的歌手有哪些",
+ "prior_schema_links":"""[]""",
+ "analysis": """让我们一步一步地思考。在问题“我想要近半年签约的播放量前十的歌手有哪些“中,我们被问:
+“歌手有哪些”,所以我们需要column=[歌手名]
+”播放量前十的“,所以我们需要column=[结算播放量]
+”近半年签约的“,所以我们需要column=[签约日期]
+基于table和columns,可能的cell values 是 = [0.5, 10]。""",
+ "schema_links":"""["歌手名", "结算播放量", "签约日期", 0.5, 10]""",
+ "sql":"""select 歌手名 from 艺人库 where datediff('year', 签约日期, '2023-09-11') <= 0.5 and 数据日期 = '2023-09-11' order by 结算播放量 desc limit 10"""
+ },
+ { "current_date":"2023-08-12",
+ "table_name":"歌曲库",
+ "fields_list": """["发行日期", "歌曲语言", "歌曲来源", "歌曲流派", "歌曲名", "歌曲版本", "歌曲类型", "发行时间", "结算播放量", "数据日期"]""",
+ "question":"最近一年发行的歌曲中,有哪些在近7天播放超过一千万的",
+ "prior_schema_links":"""[]""",
+ "analysis": """让我们一步一步地思考。在问题“最近一年发行的歌曲中,有哪些在近7天播放超过一千万的“中,我们被问:
+“发行的歌曲中,有哪些”,所以我们需要column=[歌曲名]
+”最近一年发行的“,所以我们需要column=[发行日期]
+”在近7天播放超过一千万的“,所以我们需要column=[数据日期, 结算播放量]
+基于table和columns,可能的cell values 是 = [1, 7, 10000000]"""
+ "schema_links":"""["歌曲名", "发行日期", "数据日期", "结算播放量", 1, 7, 10000000]""",
+ "sql":"""select 歌曲名 from 歌曲库 where datediff('year', 发行日期, '2023-08-12') <= 1 and datediff('day', 数据日期, '2023-08-12') <= 7 and 结算播放量 > 10000000"""
+ },
+ { "current_date":"2023-08-12",
+ "table_name":"歌曲库",
+ "fields_list": """["发行日期", "歌曲语言", "歌曲来源", "歌曲流派", "歌曲名", "歌曲版本", "歌曲类型", "发行时间", "结算播放量", "数据日期"]""",
+ "question":"今年以来发行的歌曲中,有哪些在近7天播放超过一千万的",
+ "prior_schema_links":"""[]""",
+ "analysis": """让我们一步一步地思考。在问题“今年以来发行的歌曲中,有哪些在近7天播放超过一千万的“中,我们被问:
+“发行的歌曲中,有哪些”,所以我们需要column=[歌曲名]
+”今年以来发行的“,所以我们需要column=[发行日期]
+”在近7天播放超过一千万的“,所以我们需要column=[数据日期, 结算播放量]
+基于table和columns,可能的cell values 是 = [0, 7, 10000000]""",
+ "schema_links":"""["歌曲名", "发行日期", "数据日期", "结算播放量", 0, 7, 10000000]""",
+ "sql":"""select 歌曲名 from 歌曲库 where datediff('year', 发行日期, '2023-08-12') <= 0 and datediff('day', 数据日期, '2023-08-12') <= 7 and 结算播放量 > 10000000"""
+ },
+ { "current_date":"2023-08-12",
+ "table_name":"歌曲库",
+ "fields_list": """["发行日期", "歌曲语言", "歌曲来源", "歌曲流派", "歌曲名", "歌曲版本", "歌曲类型", "发行时间", "结算播放量", "数据日期"]""",
+ "question":"2023年以来发行的歌曲中,有哪些在近7天播放超过一千万的",
+ "prior_schema_links":"""['514129144'->MPPM歌曲ID]""",
+ "analysis": """让我们一步一步地思考。在问题“2023年以来发行的歌曲中,有哪些在近7天播放超过一千万的“中,我们被问:
+“发行的歌曲中,有哪些”,所以我们需要column=[歌曲名]
+”2023年以来发行的“,所以我们需要column=[发行日期]
+”在近7天播放超过一千万的“,所以我们需要column=[数据日期, 结算播放量]
+基于table和columns,可能的cell values 是 = [2023, 7, 10000000]""",
+ "schema_links":"""["歌曲名", "发行日期", "数据日期", "结算播放量", 2023, 7, 10000000]""",
+ "sql":"""select 歌曲名 from 歌曲库 where YEAR(发行日期) >= 2023 and datediff('day', 数据日期, '2023-08-12') <= 7 and 结算播放量 > 10000000"""
+ },
+ { "current_date":"2023-08-01",
+ "table_name":"歌曲库",
+ "fields_list":"""["歌曲名", "歌曲版本", "歌手名", "歌曲类型", "发布时间", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""",
+ "question":"周洁轮2023年6月之后发布的歌曲有哪些",
+ "prior_schema_links":"""['23109'->MPPM歌手ID]""",
+ "analysis": """让我们一步一步地思考。在问题“周洁轮2023年6月之后发布的歌曲有哪些“中,我们被问:
+“歌曲有哪些”,所以我们需要column=[歌曲名]
+”2023年6月之后发布的“,所以我们需要column=[发布时间]
+”周洁轮“,所以我们需要column=[歌手名]
+基于table和columns,可能的cell values 是 = ['周洁轮', 2023, 6]。""",
+ "schema_links":"""["歌曲名", "发布时间", "歌手名", "周洁轮", 2023, 6]""",
+ "sql":"""select 歌曲名 from 歌曲库 where YEAR(发布时间) >= 2023 and MONTH(发布时间) >= 6 and 歌手名 = '周洁轮' and 数据日期 = '2023-08-01' """
+ },
+ { "current_date":"2023-08-01",
+ "table_name":"歌曲库",
+ "fields_list":"""["歌曲名", "歌曲版本", "歌手名", "歌曲类型", "发布时间", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""",
+ "question":"邓梓琦在2023年1月5日之后发布的歌曲中,有哪些播放量大于500W的?",
+ "prior_schema_links":"""['2312311'->MPPM歌手ID]""",
+ "analysis": """让我们一步一步地思考。在问题“邓梓琦在2023年1月5日之后发布的歌曲中,有哪些播放量大于500W的?“中,我们被问:
+“播放量大于500W的”,所以我们需要column=[结算播放量]
+”邓梓琦在2023年1月5日之后发布的“,所以我们需要column=[发布时间]
+”邓梓琦“,所以我们需要column=[歌手名]
+基于table和columns,可能的cell values 是 = ['邓梓琦', 2023, 1, 5, 5000000]。""",
+ "schema_links":"""["结算播放量", "发布时间", "歌手名", "邓梓琦", 2023, 1, 5, 5000000]""",
+ "sql":"""select 歌曲名 from 歌曲库 where YEAR(发布时间) >= 2023 and MONTH(发布时间) >= 1 and DAY(发布时间) >= 5 and 歌手名 = '邓梓琦' and 结算播放量 > 5000000 and 数据日期 = '2023-08-01'"""
+ },
+ { "current_date":"2023-09-17",
+ "table_name":"歌曲库",
+ "fields_list":"""["歌曲名", "歌曲版本", "歌手名", "歌曲类型", "发布时间", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""",
+ "question":"2023年6月以后,张亮英播放量大于200万的歌曲有哪些?",
+ "prior_schema_links":"""['45453'->MPPM歌手ID]""",
+ "analysis": """让我们一步一步地思考。在问题“2023年6月以后,张亮英播放量大于200万的歌曲有哪些?“中,我们被问:
+“播放量大于200万的”,所以我们需要column=[结算播放量]
+”2023年6月以后,张亮英“,所以我们需要column=[数据日期, 歌手名]
+”歌曲有哪些“,所以我们需要column=[歌曲名]
+基于table和columns,可能的cell values 是 = ['张亮英', 2023, 6, 2000000]。""",
+ "schema_links":"""["结算播放量", "数据日期", "歌手名", "张亮英", 2023, 6, 2000000]""",
+ "sql":"""select 歌曲名 from 歌曲库 where YEAR(数据日期) >= 2023 and MONTH(数据日期) >= 6 and 歌手名 = '张亮英' and 结算播放量 > 2000000 """
+ },
+ { "current_date":"2023-08-16",
+ "table_name":"歌曲库",
+ "fields_list":"""["歌曲名", "歌曲版本", "歌手名", "歌曲类型", "发布时间", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""",
+ "question":"2021年6月以后发布的李雨纯的播放量大于20万的歌曲有哪些",
+ "prior_schema_links":"""['23109'->MPPM歌手ID]""",
+ "analysis": """让我们一步一步地思考。在问题“2021年6月以后发布的李雨纯的播放量大于20万的歌曲有哪些“中,我们被问:
+“播放量大于20万的”,所以我们需要column=[结算播放量]
+”2021年6月以后发布的“,所以我们需要column=[发布时间]
+”李雨纯“,所以我们需要column=[歌手名]
+基于table和columns,可能的cell values 是 = ['李雨纯', 2021, 6, 200000]。""",
+ "schema_links":"""["结算播放量", "发布时间", "歌手名", "李雨纯", 2021, 6, 200000]""",
+ "sql":"""select 歌曲名 from 歌曲库 where YEAR(发布时间) >= 2021 and MONTH(发布时间) >= 6 and 歌手名 = '李雨纯' and 结算播放量 > 200000 and 数据日期 = '2023-08-16'"""
+ },
+ { "current_date":"2023-08-16",
+ "table_name":"歌曲库",
+ "fields_list":"""["歌曲名", "歌曲版本", "歌手名", "歌曲类型", "发布时间", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""",
+ "question":"刘锝桦在1992年4月2日到2020年5月2日之间发布的播放量大于20万的歌曲有哪些",
+ "prior_schema_links":"""['4234234'->MPPM歌手ID]""",
+ "analysis": """让我们一步一步地思考。在问题“刘锝桦在1992年4月2日到2020年5月2日之间发布的播放量大于20万的歌曲有哪些“中,我们被问:
+“播放量大于20万的”,所以我们需要column=[结算播放量]
+”1992年4月2日到2020年5月2日之间发布的“,所以我们需要column=[发布时间]
+”刘锝桦“,所以我们需要column=[歌手名]
+基于table和columns,可能的cell values 是 = ['刘锝桦', 1992, 4, 2, 2020, 5, 2, 200000]。""",
+ "schema_links":"""["结算播放量", "发布时间", "歌手名", "刘锝桦", 1992, 4, 2, 2020, 5, 2, 200000]""",
+ "sql":"""select 歌曲名 from 歌曲库 where YEAR(发布时间) >= 1992 and MONTH(发布时间) >= 4 and DAY(发布时间) >= 2 and YEAR(发布时间) <= 2020 and MONTH(发布时间) <= 5 and DAY(发布时间) <= 2 and 歌手名 = '刘锝桦' and 结算播放量 > 200000 and 数据日期 = '2023-08-16'"""
+ }
+]
\ No newline at end of file
diff --git a/chat/core/src/main/python/llm/preset_retrieval/run.py b/chat/core/src/main/python/llm/preset_retrieval/run.py
index 9027253bf..dc501b49c 100644
--- a/chat/core/src/main/python/llm/preset_retrieval/run.py
+++ b/chat/core/src/main/python/llm/preset_retrieval/run.py
@@ -8,8 +8,7 @@ from typing import Any, List, Mapping, Optional, Union
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
-import chromadb
-from chromadb.config import Settings
+
from chromadb.api import Collection, Documents, Embeddings
from langchain.llms import OpenAI
@@ -21,13 +20,9 @@ from preset_query_db import (get_ids, add2preset_query_collection,
from util.text2vec import Text2VecEmbeddingFunction
from run_config import CHROMA_DB_PERSIST_PATH, PRESET_QUERY_COLLECTION_NAME
+from util.chromadb_instance import client
-client = chromadb.Client(Settings(
- chroma_db_impl="duckdb+parquet",
- persist_directory=CHROMA_DB_PERSIST_PATH # Optional, defaults to .chromadb/ in the current directory
-))
-
emb_func = Text2VecEmbeddingFunction()
collection = client.get_or_create_collection(name=PRESET_QUERY_COLLECTION_NAME,
@@ -35,6 +30,8 @@ collection = client.get_or_create_collection(name=PRESET_QUERY_COLLECTION_NAME,
metadata={"hnsw:space": "cosine"}
) # Get a collection object from an existing collection, by name. If it doesn't exist, create it.
+print("init_preset_query_collection_size: ", preset_query_collection_size(collection))
+
def preset_query_retrieval_run(collection:Collection, query_texts_list:List[str], n_results:int=5):
retrieval_res = query2preset_query_collection(collection=collection,
diff --git a/chat/core/src/main/python/llm/run_config.py b/chat/core/src/main/python/llm/run_config.py
index 989b44e5a..e2b47b404 100644
--- a/chat/core/src/main/python/llm/run_config.py
+++ b/chat/core/src/main/python/llm/run_config.py
@@ -9,6 +9,7 @@ TEMPERATURE = 0.0
CHROMA_DB_PERSIST_DIR = 'chm_db'
PRESET_QUERY_COLLECTION_NAME = "preset_query_collection"
+TEXT2DSL_COLLECTION_NAME = "text2dsl_collection"
CHROMA_DB_PERSIST_PATH = os.path.join(PROJECT_DIR_PATH, CHROMA_DB_PERSIST_DIR)
diff --git a/chat/core/src/main/python/llm/sql/constructor.py b/chat/core/src/main/python/llm/sql/constructor.py
new file mode 100644
index 000000000..c6f367492
--- /dev/null
+++ b/chat/core/src/main/python/llm/sql/constructor.py
@@ -0,0 +1,53 @@
+# -*- coding:utf-8 -*-
+import os
+import sys
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+
+from langchain.prompts.few_shot import FewShotPromptTemplate
+from langchain.prompts import PromptTemplate
+from langchain.vectorstores import Chroma
+from langchain.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings
+from langchain.prompts.example_selector import SemanticSimilarityExampleSelector
+
+import chromadb
+from chromadb.config import Settings
+
+from few_shot_example.sql_exampler import examplars as din_sql_examplars
+from util.text2vec import Text2VecEmbeddingFunction, hg_embedding
+from util.chromadb_instance import client as chromadb_client
+
+
+from run_config import TEXT2DSL_COLLECTION_NAME
+
+
+vectorstore = Chroma(collection_name=TEXT2DSL_COLLECTION_NAME,
+ embedding_function=hg_embedding,
+ client=chromadb_client)
+
+example_nums = 15
+
+schema_linking_example_selector = SemanticSimilarityExampleSelector(vectorstore=vectorstore, k=example_nums,
+ input_keys=["question"],
+ example_keys=["table_name", "fields_list", "prior_schema_links", "question", "analysis", "schema_links"])
+
+sql_example_selector = SemanticSimilarityExampleSelector(vectorstore=vectorstore, k=example_nums,
+ input_keys=["question"],
+ example_keys=["question", "current_date", "table_name", "schema_links", "sql"])
+
+if vectorstore._collection.count() > 0:
+ print("examples already in din_sql_vectorstore")
+ print("init din_sql_vectorstore size:", vectorstore._collection.count())
+ if vectorstore._collection.count() < len(din_sql_examplars):
+ print("din_sql_examplars size:", len(din_sql_examplars))
+ vectorstore._collection.delete()
+ print("empty din_sql_vectorstore")
+ for example in din_sql_examplars:
+ schema_linking_example_selector.add_example(example)
+ print("added din_sql_vectorstore size:", vectorstore._collection.count())
+else:
+ for example in din_sql_examplars:
+ schema_linking_example_selector.add_example(example)
+
+ print("added din_sql_vectorstore size:", vectorstore._collection.count())
diff --git a/chat/core/src/main/python/llm/sql/output_parser.py b/chat/core/src/main/python/llm/sql/output_parser.py
index 64df5ba1f..c90388850 100644
--- a/chat/core/src/main/python/llm/sql/output_parser.py
+++ b/chat/core/src/main/python/llm/sql/output_parser.py
@@ -1,15 +1,13 @@
# -*- coding:utf-8 -*-
import re
-
def schema_link_parse(schema_link_output):
- try:
- schema_link_output = schema_link_output.strip()
- pattern = r'Schema_links:(.*)'
- schema_link_output = re.findall(pattern, schema_link_output, re.DOTALL)[
- 0].strip()
- except Exception as e:
- print(e)
- schema_link_output = None
+ try:
+ schema_link_output = schema_link_output.strip()
+ pattern = r'Schema_links:(.*)'
+ schema_link_output = re.findall(pattern, schema_link_output, re.DOTALL)[0].strip()
+ except Exception as e:
+ print(e)
+ schema_link_output = None
- return schema_link_output
+ return schema_link_output
\ No newline at end of file
diff --git a/chat/core/src/main/python/llm/sql/prompt_maker.py b/chat/core/src/main/python/llm/sql/prompt_maker.py
index 6e05f95b3..0cfed83b1 100644
--- a/chat/core/src/main/python/llm/sql/prompt_maker.py
+++ b/chat/core/src/main/python/llm/sql/prompt_maker.py
@@ -1,8 +1,5 @@
# -*- coding:utf-8 -*-
from typing import Any, List, Mapping, Optional, Union
-import requests
-import logging
-import json
import os
import sys
@@ -11,78 +8,68 @@ sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from langchain.prompts import PromptTemplate
from langchain.prompts.few_shot import FewShotPromptTemplate
-from langchain.llms import OpenAI
-
-from few_shot_example.sql_exampler import examplars
-from output_parser import schema_link_parse
-
-
-def schema_linking_prompt_maker(user_query: str, model_name: str,
- fields_list: List[str],
- few_shots_example: str):
- instruction = "# 根据数据库的表结构,找出为每个问题生成SQL查询语句的schema_links\n"
-
- schema_linking_prompt = "Table {table_name}, columns = {fields_list}\n问题:{user_query}\n分析: 让我们一步一步地思考。".format(
- table_name=model_name,
- fields_list=fields_list,
- user_query=user_query)
-
- return instruction + few_shots_example + schema_linking_prompt
+from langchain.prompts.example_selector import SemanticSimilarityExampleSelector
def schema_linking_exampler(user_query: str,
- model_name: str,
- fields_list: List[str]
-) -> str:
- example_prompt_template = PromptTemplate(
- input_variables=["table_name", "fields_list", "question", "analysis",
- "schema_links"],
- template="Table {table_name}, columns = {fields_list}\n问题:{question}\n分析:{analysis} 所以Schema_links是:\nSchema_links:{schema_links}")
+ domain_name: str,
+ fields_list: List[str],
+ prior_schema_links: Mapping[str,str],
+ example_selector: SemanticSimilarityExampleSelector,
+ ) -> str:
- instruction = "# 根据数据库的表结构,找出为每个问题生成SQL查询语句的schema_links"
+ prior_schema_links_str = '['+ ','.join(["""'{}'->{}""".format(k,v) for k,v in prior_schema_links.items()]) + ']'
- schema_linking_prompt = "Table {table_name}, columns = {fields_list}\n问题:{question}\n分析: 让我们一步一步地思考。"
+ example_prompt_template = PromptTemplate(input_variables=["table_name", "fields_list", "prior_schema_links", "question", "analysis", "schema_links"],
+ template="Table {table_name}, columns = {fields_list}, prior_schema_links = {prior_schema_links}\n问题:{question}\n分析:{analysis} 所以Schema_links是:\nSchema_links:{schema_links}")
- schema_linking_example_prompt_template = FewShotPromptTemplate(
- examples=examplars,
- example_prompt=example_prompt_template,
- example_separator="\n\n",
- prefix=instruction,
- input_variables=["table_name", "fields_list", "question"],
- suffix=schema_linking_prompt
- )
+ instruction = "# 根据数据库的表结构,参考先验信息,找出为每个问题生成SQL查询语句的schema_links"
- schema_linking_example_prompt = schema_linking_example_prompt_template.format(
- table_name=model_name,
- fields_list=fields_list,
- question=user_query)
+ schema_linking_prompt = "Table {table_name}, columns = {fields_list}, prior_schema_links = {prior_schema_links}\n问题:{question}\n分析: 让我们一步一步地思考。"
- return schema_linking_example_prompt
+ schema_linking_example_prompt_template = FewShotPromptTemplate(
+ example_selector=example_selector,
+ example_prompt=example_prompt_template,
+ example_separator="\n\n",
+ prefix=instruction,
+ input_variables=["table_name", "fields_list", "prior_schema_links", "question"],
+ suffix=schema_linking_prompt
+ )
+
+ schema_linking_example_prompt = schema_linking_example_prompt_template.format(table_name=domain_name,
+ fields_list=fields_list,
+ prior_schema_links=prior_schema_links_str,
+ question=user_query)
+
+ return schema_linking_example_prompt
def sql_exampler(user_query: str,
- model_name: str,
- schema_link_str: str
-) -> str:
- instruction = "# 根据schema_links为每个问题生成SQL查询语句"
+ domain_name: str,
+ schema_link_str: str,
+ data_date: str,
+ example_selector: SemanticSimilarityExampleSelector,
+ ) -> str:
+
+ instruction = "# 根据schema_links为每个问题生成SQL查询语句"
- sql_example_prompt_template = PromptTemplate(
- input_variables=["question", "table_name", "schema_links", "sql"],
- template="问题:{question}\nTable {table_name}\nSchema_links:{schema_links}\nSQL:{sql}")
+ sql_example_prompt_template = PromptTemplate(input_variables=["question", "current_date", "table_name", "schema_links", "sql"],
+ template="问题:{question}\nCurrent_date:{current_date}\nTable {table_name}\nSchema_links:{schema_links}\nSQL:{sql}")
- sql_prompt = "问题:{question}\nTable {table_name}\nSchema_links:{schema_links}\nSQL:"
+ sql_prompt = "问题:{question}\nCurrent_date:{current_date}\nTable {table_name}\nSchema_links:{schema_links}\nSQL:"
- sql_example_prompt_template = FewShotPromptTemplate(
- examples=examplars,
- example_prompt=sql_example_prompt_template,
- example_separator="\n\n",
- prefix=instruction,
- input_variables=["question", "table_name", "schema_links"],
- suffix=sql_prompt
- )
+ sql_example_prompt_template = FewShotPromptTemplate(
+ example_selector=example_selector,
+ example_prompt=sql_example_prompt_template,
+ example_separator="\n\n",
+ prefix=instruction,
+ input_variables=["question", "current_date", "table_name", "schema_links"],
+ suffix=sql_prompt
+ )
- sql_example_prompt = sql_example_prompt_template.format(question=user_query,
- table_name=model_name,
- schema_links=schema_link_str)
+ sql_example_prompt = sql_example_prompt_template.format(question=user_query,
+ current_date=data_date,
+ table_name=domain_name,
+ schema_links=schema_link_str)
- return sql_example_prompt
+ return sql_example_prompt
diff --git a/chat/core/src/main/python/llm/sql/run.py b/chat/core/src/main/python/llm/sql/run.py
index ea60d7f36..34919799b 100644
--- a/chat/core/src/main/python/llm/sql/run.py
+++ b/chat/core/src/main/python/llm/sql/run.py
@@ -1,6 +1,4 @@
-# -*- coding:utf-8 -*-
-
-from typing import List, Union
+from typing import List, Union, Mapping
import logging
import json
import os
@@ -9,33 +7,54 @@ import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
-from sql.prompt_maker import schema_linking_exampler, schema_link_parse, \
- sql_exampler
+from sql.prompt_maker import schema_linking_exampler, sql_exampler
+from sql.constructor import schema_linking_example_selector, sql_example_selector
+from sql.output_parser import schema_link_parse
from util.llm_instance import llm
-def query2sql(query_text: str, schema: dict):
- print("schema: ", schema)
- model_name = schema['modelName']
- fields_list = schema['fieldNameList']
+def query2sql(query_text: str,
+ schema : Union[dict, None] = None,
+ current_date: Union[str, None] = None,
+ linking: Union[List[Mapping[str, str]], None] = None
+ ):
+
+ print("query_text: ", query_text)
+ print("schema: ", schema)
+ print("current_date: ", current_date)
+ print("prior_schema_links: ", linking)
- schema_linking_prompt = schema_linking_exampler(query_text, model_name,
- fields_list)
- schema_link_output = llm(schema_linking_prompt)
- schema_link_str = schema_link_parse(schema_link_output)
+ if linking is not None:
+ prior_schema_links = {item['fieldValue']:item['fieldName'] for item in linking}
+ else:
+ prior_schema_links = {}
- sql_prompt = sql_exampler(query_text, model_name, schema_link_str)
- sql_output = llm(sql_prompt)
+ model_name = schema['modelName']
+ fields_list = schema['fieldNameList']
- resp = dict()
- resp['query'] = query_text
- resp['model'] = model_name
- resp['fields'] = fields_list
+ schema_linking_prompt = schema_linking_exampler(query_text, model_name, fields_list, prior_schema_links, schema_linking_example_selector)
+ print("schema_linking_prompt->", schema_linking_prompt)
+ schema_link_output = llm(schema_linking_prompt)
+ schema_link_str = schema_link_parse(schema_link_output)
+
+ sql_prompt = sql_exampler(query_text, model_name, schema_link_str, current_date, sql_example_selector)
+ print("sql_prompt->", sql_prompt)
+ sql_output = llm(sql_prompt)
- resp['schemaLinkingOutput'] = schema_link_output
- resp['schemaLinkStr'] = schema_link_str
+ resp = dict()
+ resp['query'] = query_text
+ resp['model'] = model_name
+ resp['fields'] = fields_list
+ resp['priorSchemaLinking'] = linking
+ resp['dataDate'] = current_date
- resp['sqlOutput'] = sql_output
+ resp['schemaLinkingOutput'] = schema_link_output
+ resp['schemaLinkStr'] = schema_link_str
+
+ resp['sqlOutput'] = sql_output
+
+ print("resp: ", resp)
+
+ return resp
- return resp
diff --git a/chat/core/src/main/python/llm/util/chromadb_instance.py b/chat/core/src/main/python/llm/util/chromadb_instance.py
new file mode 100644
index 000000000..f0fe6ce01
--- /dev/null
+++ b/chat/core/src/main/python/llm/util/chromadb_instance.py
@@ -0,0 +1,10 @@
+# -*- coding:utf-8 -*-
+import chromadb
+from chromadb.config import Settings
+
+from run_config import CHROMA_DB_PERSIST_PATH
+
+client = chromadb.Client(Settings(
+ chroma_db_impl="duckdb+parquet",
+ persist_directory=CHROMA_DB_PERSIST_PATH # Optional, defaults to .chromadb/ in the current directory
+))
\ No newline at end of file
diff --git a/docs/images/supersonic_components.png b/docs/images/supersonic_components.png
index 4db266acc..07ebe188d 100644
Binary files a/docs/images/supersonic_components.png and b/docs/images/supersonic_components.png differ
diff --git a/docs/images/wechat_contact.jpeg b/docs/images/wechat_contact.jpeg
new file mode 100644
index 000000000..1d913ecc0
Binary files /dev/null and b/docs/images/wechat_contact.jpeg differ
diff --git a/launchers/chat/src/main/bin/run.sh b/launchers/chat/src/main/bin/run.sh
index 00d517aea..694b87156 100755
--- a/launchers/chat/src/main/bin/run.sh
+++ b/launchers/chat/src/main/bin/run.sh
@@ -1,7 +1,7 @@
#!/usr/bin/env bash
binDir=$(cd "$(dirname "$0")"; pwd)
-baseDir=$(readlink -f $binDir/../)
+baseDir=$(cd "$binDir/.." && pwd -P)
libDir=$baseDir/lib
confDir=$baseDir/conf
webDir=$baseDir/webapp
diff --git a/launchers/chat/src/main/bin/service.sh b/launchers/chat/src/main/bin/service.sh
index fef8ff3a7..24c32d098 100755
--- a/launchers/chat/src/main/bin/service.sh
+++ b/launchers/chat/src/main/bin/service.sh
@@ -1,7 +1,7 @@
#!/usr/bin/env bash
binDir=$(cd "$(dirname "$0")"; pwd)
-baseDir=$(readlink -f $binDir/../)
+baseDir=$(cd "$binDir/.." && pwd -P)
confDir=$baseDir/conf
source ${baseDir}/bin/env.sh
diff --git a/launchers/semantic/src/main/bin/run.sh b/launchers/semantic/src/main/bin/run.sh
index 00d517aea..79a06ca90 100755
--- a/launchers/semantic/src/main/bin/run.sh
+++ b/launchers/semantic/src/main/bin/run.sh
@@ -1,7 +1,7 @@
#!/usr/bin/env bash
binDir=$(cd "$(dirname "$0")"; pwd)
-baseDir=$(readlink -f $binDir/../)
+baseDir=$(cd "$binDir/.." && pwd -P)
libDir=$baseDir/lib
confDir=$baseDir/conf
webDir=$baseDir/webapp
diff --git a/launchers/semantic/src/main/bin/service.sh b/launchers/semantic/src/main/bin/service.sh
index fef8ff3a7..f3374d01e 100755
--- a/launchers/semantic/src/main/bin/service.sh
+++ b/launchers/semantic/src/main/bin/service.sh
@@ -1,7 +1,7 @@
#!/usr/bin/env bash
binDir=$(cd "$(dirname "$0")"; pwd)
-baseDir=$(readlink -f $binDir/../)
+baseDir=$(cd "$binDir/.." && pwd -P)
confDir=$baseDir/conf
source ${baseDir}/bin/env.sh
diff --git a/launchers/standalone/src/main/bin/run.sh b/launchers/standalone/src/main/bin/run.sh
index 00d517aea..79a06ca90 100755
--- a/launchers/standalone/src/main/bin/run.sh
+++ b/launchers/standalone/src/main/bin/run.sh
@@ -1,7 +1,7 @@
#!/usr/bin/env bash
binDir=$(cd "$(dirname "$0")"; pwd)
-baseDir=$(readlink -f $binDir/../)
+baseDir=$(cd "$binDir/.." && pwd -P)
libDir=$baseDir/lib
confDir=$baseDir/conf
webDir=$baseDir/webapp
diff --git a/launchers/standalone/src/main/bin/service.sh b/launchers/standalone/src/main/bin/service.sh
index fef8ff3a7..f3374d01e 100755
--- a/launchers/standalone/src/main/bin/service.sh
+++ b/launchers/standalone/src/main/bin/service.sh
@@ -1,7 +1,7 @@
#!/usr/bin/env bash
binDir=$(cd "$(dirname "$0")"; pwd)
-baseDir=$(readlink -f $binDir/../)
+baseDir=$(cd "$binDir/.." && pwd -P)
confDir=$baseDir/conf
source ${baseDir}/bin/env.sh
diff --git a/pom.xml b/pom.xml
index 466b96143..0651f854f 100644
--- a/pom.xml
+++ b/pom.xml
@@ -63,7 +63,7 @@
1.23.0
3.2.4
4.5
- 0.7.2
+ 0.7.3