Mirror of https://github.com/tencentmusic/supersonic.git (synced 2025-12-10 11:07:06 +00:00)

(improvement)(project) Remove Python module and related Python code. (#1263)
@@ -23,12 +23,7 @@ del temp.txt
 cd %baseDir%
 
 
-if "%service%"=="%pyllm_service%" (
-echo start installing python modules required by supersonic-pyllm: %pip_path%
-%pip_path% install -r %projectDir%\headless\python\requirements.txt"
-echo install python modules success
-goto :EOF
-) else if "%service%"=="webapp" (
+if "%service%"=="webapp" (
 call :buildWebapp
 tar xvf supersonic-webapp.tar.gz
 move /y supersonic-webapp webapp

@@ -63,12 +63,7 @@ function packageRelease {
 }
 
 #1. build backend services
-if [ "$service" == $PYLLM_SERVICE ]; then
-echo "start installing python modules required by supersonic-pyllm: ${pip_path}"
-requirementPath=$projectDir/headless/python/requirements.txt
-${pip_path} install -r ${requirementPath}
-echo "install python modules success"
-elif [ "$service" == "webapp" ]; then
+if [ "$service" == "webapp" ]; then
 buildWebapp
 target_path=$projectDir/launchers/$STANDALONE_SERVICE/target/classes
 tar xvf $projectDir/webapp/supersonic-webapp.tar.gz -C $target_path

@@ -2,8 +2,5 @@ set "sbinDir=%~dp0"
 set "baseDir=%~dp0.."
 set "buildDir=%baseDir%\build"
 set "main_class=com.tencent.supersonic.StandaloneLauncher"
-set "python_path=python"
-set "pip_path=pip3"
 set "standalone_service=standalone"
-set "pyllm_service=pyllm"
 set "projectDir=%baseDir%\.."

@@ -1,9 +1,6 @@
 #!/usr/bin/env bash
 
 # environment parameters
-python_path=${PYTHON_PATH:-"python3"}
-pip_path=${PIP_PATH:-"pip3"}
-
 sbinDir=$(cd "$(dirname "$0")"; pwd)
 baseDir=$(cd "$sbinDir/.." && pwd -P)
 runtimeDir=$baseDir/runtime

@@ -12,13 +9,8 @@ projectDir=$baseDir/..
 
 readonly CHAT_APP_NAME="supersonic_chat"
 readonly HEADLESS_APP_NAME="supersonic_headless"
-readonly PYLLM_APP_NAME="supersonic_pyllm"
 readonly STANDALONE_APP_NAME="supersonic_standalone"
 
 readonly CHAT_SERVICE="chat"
 readonly HEADLESS_SERVICE="headless"
-readonly PYLLM_SERVICE="pyllm"
 readonly STANDALONE_SERVICE="standalone"
-
-readonly PYLLM_HOST="127.0.0.1"
-readonly PYLLM_PORT="9092"

@@ -11,10 +11,6 @@ if "%service%"=="" (
 set "service=%standalone_service%"
 )
 set "model_name=%service%"
-IF "%service%"=="pyllm" (
-set "llmProxy=PythonLLMProxy"
-set "model_name=%standalone_service%"
-)
 
 cd %baseDir%
 

@@ -36,30 +32,15 @@ if "%command%"=="restart" (
 goto :EOF
 )
 
 
 : start
-if "%service%"=="%pyllm_service%" (
-call :runPythonService
-call :runJavaService
-goto :EOF
-)
 call :runJavaService
 goto :EOF
 
 
 : stop
-call :stopPythonService
 call :stopJavaService
 goto :EOF
-
-
-: reloadExamples
-set "pythonRunDir=%baseDir%\pyllm"
-cd "%pythonRunDir%\sql"
-start %python_path% examples_reload_run.py
-goto :EOF
-
-
 : runJavaService
 echo 'java service starting, see logs in logs/'
 set "libDir=%baseDir%\lib"

@@ -74,24 +55,6 @@ if "%command%"=="restart" (
 echo 'java service started'
 goto :EOF
 
-
-: runPythonService
-echo 'python service starting, see logs in pyllm\pyllm.log'
-set "pythonRunDir=%baseDir%\pyllm"
-start /B %python_path% %pythonRunDir%\supersonic_pyllm.py > %pythonRunDir%\pyllm.log 2>&1
-timeout /t 10 >nul
-echo 'python service started'
-goto :EOF
-
-
-: stopPythonService
-for /f "tokens=2" %%i in ('tasklist ^| findstr /i "python"') do (
-taskkill /PID %%i /F
-echo "python service (PID = %%i) is killed."
-)
-goto :EOF
-
-
 : stopJavaService
 for /f "tokens=2" %%i in ('tasklist ^| findstr /i "java"') do (
 taskkill /PID %%i /F

@@ -10,10 +10,6 @@ if [ -z "$service" ]; then
 fi
 
 model_name=$service
-if [ "$service" == "pyllm" ]; then
-model_name=${STANDALONE_SERVICE}
-export llmProxy=PythonLLMProxy
-fi
 cd $baseDir
 
 function setMainClass {

@@ -36,11 +32,6 @@ function setAppName {
 fi
 }
 
-function reloadExamples {
-cd $baseDir/pyllm/sql
-${python_path} examples_reload_run.py
-}
-
 function runJavaService {
 javaRunDir=$baseDir
 local_app_name=$1

@@ -72,49 +63,23 @@ function runJavaService {
 fi
 }
 
-function runPythonService {
-pythonRunDir=$baseDir/pyllm
-cd $pythonRunDir
-nohup ${python_path} supersonic_pyllm.py > $pythonRunDir/pyllm.log 2>&1 &
-# add health check
-for i in {1..10}
-do
-echo "pyllm health check attempt $i..."
-response=$(curl -s http://${PYLLM_HOST}:${PYLLM_PORT}/health)
-echo "pyllm health check response: $response"
-status_ok="Healthy"
-if [[ $response == *$status_ok* ]] ; then
-echo "pyllm Health check passed."
-break
-else
-if [ "$i" -eq 10 ]; then
-echo "pyllm Health check failed after 10 attempts."
-echo "May still downloading model files. Please check pyllm.log in runtime directory."
-fi
-echo "Retrying after 5 seconds..."
-sleep 5
-fi
-done
-}
-
 function start()
 {
 local_app_name=$1
+echo "Starting ${local_app_name}"
 pid=$(ps aux |grep ${local_app_name} | grep -v grep | awk '{print $2}')
 if [[ "$pid" == "" ]]; then
-if [[ ${local_app_name} == $PYLLM_APP_NAME ]]; then
-runPythonService ${local_app_name}
-else
 runJavaService ${local_app_name}
-fi
 else
 echo "Process (PID = $pid) is running."
 return 1
 fi
+echo "Start success"
 }
 
 function stop()
 {
+echo "Stopping $1"
 pid=$(ps aux | grep $1 | grep -v grep | awk '{print $2}')
 if [[ "$pid" == "" ]]; then
 echo "Process $1 is not running !"

@@ -124,51 +89,21 @@ function stop()
 echo "Process (PID = $pid) is killed !"
 return 0
 fi
-}
-
-function reload()
-{
-if [[ $1 == $PYLLM_APP_NAME ]]; then
-reloadExamples
-fi
+echo "Stop success"
 }
 
 setMainClass
 setAppName
 case "$command" in
 start)
-if [ "$service" == $PYLLM_SERVICE ]; then
-echo "Starting $PYLLM_APP_NAME"
-start $PYLLM_APP_NAME
-fi
-echo "Starting ${app_name}"
 start ${app_name}
-echo "Start success"
 ;;
 stop)
-echo "Stopping $app_name"
 stop $app_name
-echo "Stopping $PYLLM_APP_NAME"
-stop $PYLLM_APP_NAME
-echo "Stop success"
-;;
-reload)
-echo "Reloading ${app_name}"
-reload ${app_name}
-echo "Reload success"
 ;;
 restart)
-if [ "$service" == $PYLLM_SERVICE ]; then
-echo "Stopping $PYLLM_APP_NAME"
-stop $PYLLM_APP_NAME
-echo "Starting $PYLLM_APP_NAME"
-start $PYLLM_APP_NAME
-fi
-echo "Stopping ${app_name}"
 stop ${app_name}
-echo "Starting ${app_name}"
 start ${app_name}
-echo "Restart success"
 ;;
 *)
 echo "Use command {start|stop|restart} to run."

@@ -20,12 +20,6 @@
         <include>*.jar</include>
     </includes>
 </fileSet>
-<fileSet>
-    <directory>${project.basedir}/../../headless/python</directory>
-    <outputDirectory>pyllm</outputDirectory>
-    <fileMode>0777</fileMode>
-    <directoryMode>0755</directoryMode>
-</fileSet>
 <fileSet>
     <directory>${project.basedir}/../../assembly/bin</directory>
     <excludes>

@@ -7,14 +7,10 @@ import com.tencent.supersonic.chat.server.plugin.PluginManager;
 import com.tencent.supersonic.chat.server.plugin.PluginRecallResult;
 import com.tencent.supersonic.chat.server.plugin.recognize.PluginRecognizer;
 import com.tencent.supersonic.chat.server.pojo.ChatParseContext;
-import com.tencent.supersonic.common.config.EmbeddingConfig;
 import com.tencent.supersonic.common.util.ContextUtils;
 import dev.langchain4j.store.embedding.Retrieval;
 import dev.langchain4j.store.embedding.RetrieveQueryResult;
-import com.tencent.supersonic.headless.chat.utils.ComponentFactory;
-import com.tencent.supersonic.headless.chat.parser.llm.PythonLLMProxy;
 import lombok.extern.slf4j.Slf4j;
-import org.apache.commons.lang3.StringUtils;
 import org.apache.commons.lang3.tuple.Pair;
 import org.springframework.util.CollectionUtils;
 

@@ -31,10 +27,6 @@ import java.util.stream.Collectors;
 public class EmbeddingRecallRecognizer extends PluginRecognizer {
 
     public boolean checkPreCondition(ChatParseContext chatParseContext) {
-        EmbeddingConfig embeddingConfig = ContextUtils.getBean(EmbeddingConfig.class);
-        if (StringUtils.isBlank(embeddingConfig.getUrl()) && ComponentFactory.getLLMProxy() instanceof PythonLLMProxy) {
-            return false;
-        }
         List<Plugin> plugins = getPluginList(chatParseContext);
         return !CollectionUtils.isEmpty(plugins);
     }

@@ -1,36 +1,21 @@
 package com.tencent.supersonic.chat.server.util;
 
 import com.google.common.collect.Lists;
-import com.tencent.supersonic.chat.api.pojo.request.SimilarQueryReq;
 import com.tencent.supersonic.chat.api.pojo.response.SimilarQueryRecallResp;
 import com.tencent.supersonic.common.config.EmbeddingConfig;
 import com.tencent.supersonic.common.service.EmbeddingService;
-import dev.langchain4j.data.document.Metadata;
-import dev.langchain4j.data.segment.TextSegment;
 import dev.langchain4j.store.embedding.Retrieval;
 import dev.langchain4j.store.embedding.RetrieveQuery;
 import dev.langchain4j.store.embedding.RetrieveQueryResult;
-import dev.langchain4j.store.embedding.TextSegmentConvert;
 import lombok.extern.slf4j.Slf4j;
 import org.apache.commons.collections.CollectionUtils;
-import org.apache.commons.lang3.StringUtils;
 import org.springframework.beans.factory.annotation.Autowired;
-import org.springframework.core.ParameterizedTypeReference;
-import org.springframework.http.HttpEntity;
-import org.springframework.http.HttpHeaders;
-import org.springframework.http.HttpMethod;
-import org.springframework.http.MediaType;
-import org.springframework.http.ResponseEntity;
 import org.springframework.stereotype.Component;
-import org.springframework.web.client.RestTemplate;
-import org.springframework.web.util.UriComponentsBuilder;
 
-import java.net.URI;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
-import java.util.Optional;
 import java.util.Set;
 import java.util.stream.Collectors;
 

@@ -48,28 +33,7 @@ public class SimilarQueryManager {
         this.embeddingConfig = embeddingConfig;
     }
 
-    public void saveSimilarQuery(SimilarQueryReq similarQueryReq) {
-        if (StringUtils.isBlank(embeddingConfig.getUrl())) {
-            return;
-        }
-        String queryText = similarQueryReq.getQueryText();
-        try {
-            Map<String, Object> metaData = new HashMap<>();
-            metaData.put("agentId", String.valueOf(similarQueryReq.getAgentId()));
-            TextSegment textSegment = TextSegment.from(queryText, new Metadata(metaData));
-            TextSegmentConvert.addQueryId(textSegment, String.valueOf(similarQueryReq.getQueryId()));
-
-            String solvedQueryCollection = embeddingConfig.getSolvedQueryCollection();
-            embeddingService.addQuery(solvedQueryCollection, Lists.newArrayList(textSegment));
-        } catch (Exception e) {
-            log.warn("save history question to embedding failed, queryText:{}", queryText, e);
-        }
-    }
-
     public List<SimilarQueryRecallResp> recallSimilarQuery(String queryText, Integer agentId) {
-        if (StringUtils.isBlank(embeddingConfig.getUrl())) {
-            return Lists.newArrayList();
-        }
         List<SimilarQueryRecallResp> similarQueryRecallResps = Lists.newArrayList();
         try {
             String solvedQueryCollection = embeddingConfig.getSolvedQueryCollection();

@@ -113,30 +77,4 @@ public class SimilarQueryManager {
         return similarQueryRecallResps.stream()
                 .limit(embeddingConfig.getSolvedQueryResultNum()).collect(Collectors.toList());
     }
-
-    private ResponseEntity<String> doRequest(String path, String jsonBody) {
-        if (StringUtils.isEmpty(embeddingConfig.getUrl())) {
-            return ResponseEntity.of(Optional.empty());
-        }
-        String url = embeddingConfig.getUrl() + path;
-        try {
-            HttpHeaders headers = new HttpHeaders();
-            headers.setContentType(MediaType.APPLICATION_JSON);
-            headers.setLocation(URI.create(url));
-            URI requestUrl = UriComponentsBuilder
-                    .fromHttpUrl(url).build().encode().toUri();
-            HttpEntity<String> entity = new HttpEntity<>(jsonBody, headers);
-            log.info("[embedding] request body :{}, url:{}", jsonBody, url);
-            RestTemplate restTemplate = new RestTemplate();
-            ResponseEntity<String> responseEntity = restTemplate.exchange(requestUrl,
-                    HttpMethod.POST, entity, new ParameterizedTypeReference<String>() {
-                    });
-            log.info("[embedding] result body:{}", responseEntity);
-            return responseEntity;
-        } catch (Exception e) {
-            log.warn("connect to embedding service failed, url:{}", url);
-        }
-        return ResponseEntity.of(Optional.empty());
-    }
-
 }

@@ -7,13 +7,6 @@ import org.springframework.context.annotation.Configuration;
 @Configuration
 @Data
 public class EmbeddingConfig {
 
-    @Value("${s2.embedding.url:}")
-    private String url;
-
-    @Value("${s2.embedding.recognize.path:/preset_query_retrival}")
-    private String recognizePath;
-
     @Value("${s2.embedding.preset.collection:preset_query_collection}")
     private String presetCollection;
-

@@ -8,13 +8,6 @@ import org.springframework.context.annotation.Configuration;
 @Configuration
 @Data
 public class LLMParserConfig {
 
-    @Value("${s2.parser.url:}")
-    private String url;
-
-    @Value("${s2.query2sql.path:/query2sql}")
-    private String queryToSqlPath;
-
     @Value("${s2.recall.max.retries:3}")
     private int recallMaxRetries;
-

@@ -1,61 +0,0 @@
-package com.tencent.supersonic.headless.chat.parser.llm;
-
-import com.tencent.supersonic.common.util.ContextUtils;
-import com.tencent.supersonic.common.util.JsonUtil;
-import com.tencent.supersonic.headless.chat.query.llm.s2sql.LLMReq;
-import com.tencent.supersonic.headless.chat.query.llm.s2sql.LLMResp;
-import lombok.extern.slf4j.Slf4j;
-import org.apache.commons.collections.MapUtils;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.springframework.http.HttpEntity;
-import org.springframework.http.HttpHeaders;
-import org.springframework.http.HttpMethod;
-import org.springframework.http.MediaType;
-import org.springframework.http.ResponseEntity;
-import org.springframework.stereotype.Component;
-import org.springframework.web.client.RestTemplate;
-
-import java.net.URL;
-import java.util.ArrayList;
-
-/**
- * PythonLLMProxy sends requests to LangChain-based python service.
- */
-@Slf4j
-@Component
-public class PythonLLMProxy implements LLMProxy {
-
-    private static final Logger keyPipelineLog = LoggerFactory.getLogger("keyPipeline");
-
-    public LLMResp text2sql(LLMReq llmReq) {
-        long startTime = System.currentTimeMillis();
-        log.info("requestLLM request, llmReq:{}", llmReq);
-        keyPipelineLog.info("PythonLLMProxy llmReq:{}", llmReq);
-        try {
-            LLMParserConfig llmParserConfig = ContextUtils.getBean(LLMParserConfig.class);
-
-            URL url = new URL(new URL(llmParserConfig.getUrl()), llmParserConfig.getQueryToSqlPath());
-            HttpHeaders headers = new HttpHeaders();
-            headers.setContentType(MediaType.APPLICATION_JSON);
-            HttpEntity<String> entity = new HttpEntity<>(JsonUtil.toString(llmReq), headers);
-            RestTemplate restTemplate = ContextUtils.getBean(RestTemplate.class);
-            ResponseEntity<LLMResp> responseEntity = restTemplate.exchange(url.toString(), HttpMethod.POST, entity,
-                    LLMResp.class);
-
-            LLMResp llmResp = responseEntity.getBody();
-            log.info("requestLLM response,cost:{}, questUrl:{} \n entity:{} \n body:{}",
-                    System.currentTimeMillis() - startTime, url, entity, llmResp);
-            keyPipelineLog.info("PythonLLMProxy llmResp:{}", llmResp);
-
-            if (MapUtils.isEmpty(llmResp.getSqlRespMap())) {
-                llmResp.setSqlRespMap(OutputFormat.buildSqlRespMap(new ArrayList<>(), llmResp.getSqlWeight()));
-            }
-            return llmResp;
-        } catch (Exception e) {
-            log.error("requestLLM error", e);
-        }
-        return null;
-    }
-
-}

@@ -20,9 +20,4 @@ public class LLMResp {
 
     private Map<String, LLMSqlResp> sqlRespMap;
 
-    /**
-     * Only for compatibility with python code, later deleted
-     */
-    private Map<String, Double> sqlWeight;
-
 }

@@ -1,84 +0,0 @@
-# -*- coding:utf-8 -*-
-import configparser
-
-import os
-import sys
-sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-sys.path.append(os.path.dirname(os.path.abspath(__file__)))
-
-class EnvInterpolation(configparser.BasicInterpolation):
-    """Interpolation which expands environment variables in values."""
-
-    def before_get(self, parser, section, option, value, defaults):
-        value = super().before_get(parser, section, option, value, defaults)
-        return os.path.expandvars(value)
-
-def type_convert(input_str: str):
-    try:
-        return eval(input_str)
-    except:
-        return input_str
-
-
-PROJECT_DIR_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-config_dir = "config"
-CONFIG_DIR_PATH = os.path.join(PROJECT_DIR_PATH, config_dir)
-config_file = "run_config.ini"
-config_path = os.path.join(CONFIG_DIR_PATH, config_file)
-
-config = configparser.ConfigParser(interpolation=EnvInterpolation())
-config.read(config_path)
-
-log_dir = "log"
-LOG_DIR_PATH = os.path.join(PROJECT_DIR_PATH, log_dir)
-log_file = "run.log"
-LOG_FILE_PATH = os.path.join(LOG_DIR_PATH, log_file)
-
-llm_parser_section_name = "LLMParser"
-LLMPARSER_HOST = config.get(llm_parser_section_name, 'LLMPARSER_HOST')
-LLMPARSER_PORT = int(config.get(llm_parser_section_name, 'LLMPARSER_PORT'))
-
-chroma_db_section_name = "ChromaDB"
-CHROMA_DB_PERSIST_DIR = config.get(chroma_db_section_name, 'CHROMA_DB_PERSIST_DIR')
-PRESET_QUERY_COLLECTION_NAME = config.get(chroma_db_section_name, 'PRESET_QUERY_COLLECTION_NAME')
-SOLVED_QUERY_COLLECTION_NAME = config.get(chroma_db_section_name, 'SOLVED_QUERY_COLLECTION_NAME')
-TEXT2DSLAGENT_COLLECTION_NAME = config.get(chroma_db_section_name, 'TEXT2DSLAGENT_COLLECTION_NAME')
-TEXT2DSLAGENTACT_COLLECTION_NAME = config.get(chroma_db_section_name, 'TEXT2DSLAGENTACT_COLLECTION_NAME')
-TEXT2DSL_EXAMPLE_NUM = int(config.get(chroma_db_section_name, 'TEXT2DSL_EXAMPLE_NUM'))
-TEXT2DSL_FEWSHOTS_NUM = int(config.get(chroma_db_section_name, 'TEXT2DSL_FEWSHOTS_NUM'))
-TEXT2DSL_SELF_CONSISTENCY_NUM = int(config.get(chroma_db_section_name, 'TEXT2DSL_SELF_CONSISTENCY_NUM'))
-ACT_MIN_WINDOWN_SIZE = int(config.get(chroma_db_section_name, 'ACT_MIN_WINDOWN_SIZE'))
-ACT_MAX_WINDOWN_SIZE = int(config.get(chroma_db_section_name, 'ACT_MAX_WINDOWN_SIZE'))
-CHROMA_DB_PERSIST_PATH = os.path.join(PROJECT_DIR_PATH, CHROMA_DB_PERSIST_DIR)
-
-text2vec_section_name = "Text2Vec"
-HF_TEXT2VEC_MODEL_NAME = config.get(text2vec_section_name, 'HF_TEXT2VEC_MODEL_NAME')
-
-llm_provider_section_name = "LLMProvider"
-LLM_PROVIDER_NAME = config.get(llm_provider_section_name, 'LLM_PROVIDER_NAME')
-
-llm_model_section_name = "LLMModel"
-llm_config_dict = {}
-for option in config.options(llm_model_section_name):
-    llm_config_dict[option] = type_convert(config.get(llm_model_section_name, option))
-
-
-if __name__ == "__main__":
-    print(f"PROJECT_DIR_PATH: {PROJECT_DIR_PATH}")
-    print(f"EMB_MODEL_PATH: {HF_TEXT2VEC_MODEL_NAME}")
-    print(f"CHROMA_DB_PERSIST_PATH: {CHROMA_DB_PERSIST_PATH}")
-    print(f"LLMPARSER_HOST: {LLMPARSER_HOST}")
-    print(f"LLMPARSER_PORT: {LLMPARSER_PORT}")
-    print(f"llm_config_dict: {llm_config_dict}")
-    print(f"LLM_PROVIDER_NAME: {LLM_PROVIDER_NAME}")
-    print(f"PRESET_QUERY_COLLECTION_NAME: {PRESET_QUERY_COLLECTION_NAME}")
-    print(f"SOLVED_QUERY_COLLECTION_NAME: {SOLVED_QUERY_COLLECTION_NAME}")
-    print(f"TEXT2DSLAGENT_COLLECTION_NAME: {TEXT2DSLAGENT_COLLECTION_NAME}")
-    print(f"TEXT2DSLAGENTACT_COLLECTION_NAME: {TEXT2DSLAGENTACT_COLLECTION_NAME}")
-    print(f"TEXT2DSL_EXAMPLE_NUM: {TEXT2DSL_EXAMPLE_NUM}")
-    print(f"TEXT2DSL_FEWSHOTS_NUM: {TEXT2DSL_FEWSHOTS_NUM}")
-    print(f"TEXT2DSL_SELF_CONSISTENCY_NUM: {TEXT2DSL_SELF_CONSISTENCY_NUM}")
-    print(f"ACT_MIN_WINDOWN_SIZE: {ACT_MIN_WINDOWN_SIZE}")
-    print(f"ACT_MAX_WINDOWN_SIZE: {ACT_MAX_WINDOWN_SIZE}")
-    print(f"LOG_FILE_PATH: {LOG_FILE_PATH}")

@@ -1,27 +0,0 @@
-[LLMParser]
-LLMPARSER_HOST = 127.0.0.1
-LLMPARSER_PORT = 9092
-
-[ChromaDB]
-CHROMA_DB_PERSIST_DIR = chm_db
-PRESET_QUERY_COLLECTION_NAME = preset_query_collection
-SOLVED_QUERY_COLLECTION_NAME = solved_query_collection
-TEXT2DSLAGENT_COLLECTION_NAME = text2dsl_agent_collection
-TEXT2DSLAGENTACT_COLLECTION_NAME = text2dsl_agent_act_collection
-TEXT2DSL_EXAMPLE_NUM = 15
-TEXT2DSL_FEWSHOTS_NUM = 10
-TEXT2DSL_SELF_CONSISTENCY_NUM = 5
-ACT_MIN_WINDOWN_SIZE = 6
-ACT_MAX_WINDOWN_SIZE = 10
-
-[Text2Vec]
-HF_TEXT2VEC_MODEL_NAME = GanymedeNil/text2vec-large-chinese
-
-[LLMProvider]
-LLM_PROVIDER_NAME = openai
-
-[LLMModel]
-OPENAI_API_KEY = ${OPENAI_API_KEY}
-OPENAI_API_BASE = ${OPENAI_API_BASE}
-MODEL_NAME = ${OPENAI_MODEL_NAME}
-TEMPERATURE = ${OPENAI_TEMPERATURE}

@@ -1,374 +0,0 @@
-exemplars= [
-{ "currentDate":"2020-12-01",
-"tableName":"超音数产品",
-"fieldsList":"""["部门", "模块", "用户名", "访问次数", "访问人数", "访问时长", "数据日期"]""",
-"question":"比较jackjchen和robinlee在超音数的访问次数",
-"priorSchemaLinks":"""['jackjchen'->用户名, 'robinlee'->用户名]""",
-"analysis": """让我们一步一步地思考。在问题“比较jackjchen和robinlee在超音数的访问次数“中,我们被问:
-“比较jackjchen和robinlee”,所以我们需要column=[用户名],cell values = ['jackjchen', 'robinlee'],所以有[用户名:('jackjchen', 'robinlee')]
-”超音数的访问次数“,所以我们需要column=[访问次数]""",
-"schemaLinks":"""["用户名":("'jackjchen'", "'robinlee'"), "访问次数"]""",
-"sql":"""SELECT 用户名, 访问次数 FROM 超音数产品 WHERE 用户名 IN ('jackjchen', 'robinlee')"""
-},
-{ "currentDate":"2022-11-06",
-"tableName":"超音数产品",
-"fieldsList":"""["部门", "模块", "用户名", "访问次数", "访问人数", "访问时长", "数据日期"]""",
-"question":"超音数近12个月访问人数 按部门",
-"priorSchemaLinks":"""[]""",
-"analysis": """让我们一步一步地思考。在问题“超音数近12个月访问人数 按部门“中,我们被问:
-”超音数近12个月“,所以我们需要column=[数据日期],cell values = [12],所以有[数据日期:(12)]
-“访问人数”,所以我们需要column=[访问人数]
-”按部门“,所以我们需要column=[部门]""",
-"schemaLinks":"""["数据日期":(12), "访问人数", "部门"]""",
-"sql":"""SELECT 部门, 数据日期, 访问人数 FROM 超音数产品 WHERE datediff('month', 数据日期, '2022-11-06') <= 12 """
-},
-{ "currentDate":"2023-04-21",
-"tableName":"超音数产品",
-"fieldsList":"""["部门", "模块", "用户名", "访问次数", "访问人数", "访问时长", "数据日期"]""",
-"question":"超音数美术部、技术研发部的访问时长",
-"priorSchemaLinks":"""['美术部'->部门, '技术研发部'->部门]""",
-"analysis": """让我们一步一步地思考。在问题“超音数美术部、技术研发部的访问时长“中,我们被问:
-“访问时长”,所以我们需要column=[访问时长]
-”超音数美术部、技术研发部“,所以我们需要column=[部门], cell values = ['美术部', '技术研发部'],所以有[部门:('美术部', '技术研发部')]""",
-"schemaLinks":"""["访问时长", "部门":("'美术部'", "'技术研发部'")]""",
-"sql":"""SELECT 部门, 访问时长 FROM 超音数产品 WHERE 部门 IN ('美术部', '技术研发部')"""
-},
-{ "currentDate":"2023-08-21",
-"tableName":"严选",
-"fieldsList":"""["严选版权归属系", "付费模式", "结算播放份额", "付费用户结算播放份额", "数据日期"]""",
-"question":"近3天海田飞系MPPM结算播放份额",
-"priorSchemaLinks":"""['海田飞系'->严选版权归属系]""",
-"analysis": """让我们一步一步地思考。在问题“近3天海田飞系MPPM结算播放份额“中,我们被问:
-“MPPM结算播放份额”,所以我们需要column=[结算播放份额],
-”海田飞系“,所以我们需要column=[严选版权归属系], cell values = ['海田飞系'],所以有[严选版权归属系:('海田飞系')],
-”近3天“,所以我们需要column=[数据日期], cell values = [3],所以有[数据日期:(3)]""",
-"schemaLinks":"""["结算播放份额", "严选版权归属系":("'海田飞系'"), "数据日期":(3)]""",
-"sql":"""SELECT 严选版权归属系, 结算播放份额 FROM 严选 WHERE 严选版权归属系 = '海田飞系' AND datediff('day', 数据日期, '2023-08-21') <= 3 """
-},
-{ "currentDate":"2023-05-22",
-"tableName":"歌曲库",
-"fieldsList":"""["是否潮流人歌曲", "C音歌曲ID", "C音歌曲MID", "歌曲名", "歌曲版本", "语种", "歌曲类型", "翻唱类型", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "结算播放量", "运营播放量", "付费用户结算播放量", "历史累计结算播放量", "运营搜播量", "结算搜播量", "运营完播量", "运营推播量", "近7日复播率", "日均搜播量", "数据日期"]""",
-"question":"对比近7天翻唱版和纯音乐的歌曲播放量",
-"priorSchemaLinks":"""['纯音乐'->语种, '翻唱版'->歌曲版本]""",
-"analysis": """让我们一步一步地思考。在问题“对比近3天翻唱版和纯音乐的歌曲播放量“中,我们被问:
-“歌曲播放量”,所以我们需要column=[结算播放量]
-”翻唱版“,所以我们需要column=[歌曲版本], cell values = ['翻唱版'],所以有[歌曲版本:('翻唱版')]
-”和纯音乐的歌曲“,所以我们需要column=[语种], cell values = ['纯音乐'],所以有[语种:('纯音乐')]
-”近7天“,所以我们需要column=[数据日期], cell values = [7],所以有[数据日期:(7)]""",
-"schemaLinks":"""["结算播放量", "歌曲版本":("'翻唱版'"), "语种":("'纯音乐'"), "数据日期":(7)]""",
-"sql":"""SELECT 歌曲版本, 语种, 结算播放量 FROM 歌曲库 WHERE 歌曲版本 = '翻唱版' AND 语种 = '纯音乐' AND datediff('day', 数据日期, '2023-05-22') <= 7 """
-},
-{ "currentDate":"2023-05-31",
-"tableName":"艺人库",
-"fieldsList":"""["上下架状态", "歌手名", "歌手等级", "歌手类型", "歌手来源", "MPPM潮流人等级", "活跃区域", "年龄", "歌手才能", "歌手风格", "粉丝数", "潮音粉丝数", "超声波粉丝数", "推博粉丝数", "超声波歌曲数", "在架歌曲数", "超声波分享数", "独占歌曲数", "超声波在架歌曲评论数", "有播放量歌曲数", "数据日期"]""",
-"question":"对比一下陈拙悬、孟梅琦、赖媚韵的粉丝数",
-"priorSchemaLinks":"""['1527896'->MPPM歌手ID, '1565463'->MPPM歌手ID, '2141459'->MPPM歌手ID]""",
-"analysis": """让我们一步一步地思考。在问题“对比一下陈拙悬、孟梅琦、赖媚韵的粉丝数“中,我们被问:
-“粉丝数”,所以我们需要column=[粉丝数]
-”陈拙悬、孟梅琦、赖媚韵“,所以我们需要column=[歌手名], cell values = ['陈拙悬', '孟梅琦', '赖媚韵'],所以有[歌手名:('陈拙悬', '孟梅琦', '赖媚韵')]""",
-"schemaLinks":"""["粉丝数", "歌手名":("'陈拙悬'", "'孟梅琦'", "'赖媚韵'")]""",
-"sql":"""SELECT 歌手名, 粉丝数 FROM 艺人库 WHERE 歌手名 IN ('陈拙悬', '孟梅琦', '赖媚韵')"""
-},
-{ "currentDate":"2023-07-31",
-"tableName":"歌曲库",
-"fieldsList":"""["歌曲名", "歌曲版本", "歌曲类型", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""",
-"question":"播放量大于1万的歌曲有多少",
-"priorSchemaLinks":"""[]""",
-"analysis": """让我们一步一步地思考。在问题“播放量大于1万的歌曲有多少“中,我们被问:
-“歌曲有多少”,所以我们需要column=[歌曲名]
-”播放量大于1万的“,所以我们需要column=[结算播放量], cell values = [10000],所以有[结算播放量:(10000)]""",
-"schemaLinks":"""["歌曲名", "结算播放量":(10000)]""",
-"sql":"""SELECT 歌曲名 FROM 歌曲库 WHERE 结算播放量 > 10000"""
-},
-{ "currentDate":"2023-07-31",
-"tableName":"超音数产品",
-"fieldsList":"""["用户名", "部门", "模块", "访问时长", "访问次数", "访问人数", "数据日期"]""",
-"question":"超音数访问时长小于1小时,且来自美术部的用户是哪些",
-"priorSchemaLinks":"""['美术部'->部门]""",
-"analysis": """让我们一步一步地思考。在问题“超音数访问时长小于1小时,且来自美术部的用户是哪些“中,我们被问:
-“用户是哪些”,所以我们需要column=[用户名]
-”美术部的“,所以我们需要column=[部门], cell values = ['美术部'],所以有[部门:('美术部')]
-”访问时长小于1小时“,所以我们需要column=[访问时长], cell values = [1],所以有[访问时长:(1)]""",
-"schemaLinks":"""["用户名", "部门":("'美术部'"), "访问时长":(1)]""",
-"sql":"""SELECT 用户名 FROM 超音数产品 WHERE 部门 = '美术部' AND 访问时长 < 1"""
-},
-{ "currentDate":"2023-08-31",
-"tableName":"超音数产品",
-"fieldsList":"""["用户名", "部门", "模块", "访问时长", "访问次数", "访问人数", "数据日期"]""",
-"question":"超音数pv最高的用户有哪些",
-"priorSchemaLinks":"""[]""",
-"analysis": """让我们一步一步地思考。在问题“超音数pv最高的用户有哪些“中,我们被问:
-“用户有哪些”,所以我们需要column=[用户名]
-”pv最高的“,所以我们需要column=[访问次数], cell values = [1],所以有[访问次数:(1)]""",
-"schemaLinks":"""["用户名", "访问次数":(1)]""",
-"sql":"""SELECT 用户名 FROM 超音数产品 ORDER BY 访问次数 DESC LIMIT 1"""
-},
-{ "currentDate":"2023-08-31",
-"tableName":"艺人库",
-"fieldsList":"""["播放量层级", "播放量单调性", "播放量方差", "播放量突增类型", "播放量集中度", "歌手名", "歌手等级", "歌手类型", "歌手来源", "MPPM潮流人等级", "结算播放量", "运营播放量", "历史累计结算播放量", "有播放量歌曲数", "历史累计运营播放量", "付费用户结算播放量", "结算播放量占比", "运营播放份额", "免费用户结算播放占比", "完播量", "数据日期"]""",
-"question":"近90天袁亚伟播放量平均值是多少",
-"priorSchemaLinks":"""['152789226'->MPPM歌手ID]""",
-"analysis": """让我们一步一步地思考。在问题“近90天袁亚伟播放量平均值是多少“中,我们被问:
-“播放量平均值是多少”,所以我们需要column=[结算播放量]
-”袁亚伟“,所以我们需要column=[歌手名], cell values = ['袁亚伟'],所以有[歌手名:('袁亚伟')]
-”近90天“,所以我们需要column=[数据日期], cell values = [90],所以有[数据日期:(90)]""",
-"schemaLinks":"""["结算播放量", "歌手名":("'袁亚伟'"), "数据日期":(90)]""",
-"sql":"""SELECT AVG(结算播放量) FROM 艺人库 WHERE 歌手名 = '袁亚伟' AND datediff('day', 数据日期, '2023-08-31') <= 90 """
-},
-{ "currentDate":"2023-08-31",
-"tableName":"艺人库",
-"fieldsList":"""["播放量层级", "播放量单调性", "播放量方差", "播放量突增类型", "播放量集中度", "歌手名", "歌手等级", "歌手类型", "歌手来源", "MPPM潮流人等级", "结算播放量", "运营播放量", "历史累计结算播放量", "有播放量歌曲数", "历史累计运营播放量", "付费用户结算播放量", "结算播放量占比", "运营播放份额", "免费用户结算播放占比", "完播量", "数据日期"]""",
-"question":"周倩倩近7天结算播放量总和是多少",
-"priorSchemaLinks":"""['199509'->MPPM歌手ID]""",
-"analysis": """让我们一步一步地思考。在问题“周倩倩近7天结算播放量总和是多少“中,我们被问:
-“结算播放量总和是多少”,所以我们需要column=[结算播放量]
-”周倩倩“,所以我们需要column=[歌手名], cell values = ['周倩倩'],所以有[歌手名:('周倩倩')]
-”近7天“,所以我们需要column=[数据日期], cell values = [7],所以有[数据日期:(7)]""",
-"schemaLinks":"""["结算播放量", "歌手名":("'周倩倩'"), "数据日期":(7)]""",
-"sql":"""SELECT SUM(结算播放量) FROM 艺人库 WHERE 歌手名 = '周倩倩' AND datediff('day', 数据日期, '2023-08-31') <= 7 """
-},
-{ "currentDate":"2023-09-14",
-"tableName":"超音数产品",
-"fieldsList":"""["部门", "模块", "用户名", "访问次数", "访问人数", "访问时长", "数据日期"]""",
-"question":"超音数访问次数大于1k的部门是哪些",
-"priorSchemaLinks":"""[]""",
-"analysis": """让我们一步一步地思考。在问题“超音数访问次数大于1k的部门是哪些“中,我们被问:
-“部门是哪些”,所以我们需要column=[部门]
-”访问次数大于1k的“,所以我们需要column=[访问次数], cell values = [1000],所以有[访问次数:(1000)]""",
-"schemaLinks":"""["部门", "访问次数":(1000)]""",
-"sql":"""SELECT 部门 FROM 超音数产品 WHERE 访问次数 > 1000"""
-},
-{ "currentDate":"2023-09-18",
-"tableName":"歌曲库",
-"fieldsList":"""["歌曲名", "MPPM歌手ID", "歌曲版本", "歌曲类型", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""",
-"question":"陈亿训唱的所有的播放量大于20k的孤勇者有哪些",
-"priorSchemaLinks":"""['199509'->MPPM歌手ID, '1527123'->MPPM歌曲ID]""",
-"analysis": """让我们一步一步地思考。在问题“陈亿训唱的所有的播放量大于20k的孤勇者有哪些“中,我们被问:
-“孤勇者有哪些”,所以我们需要column=[歌曲名], cell values = ['孤勇者'],所以有[歌曲名:('孤勇者')]
-”播放量大于20k的“,所以我们需要column=[结算播放量], cell values = [20000],所以有[结算播放量:(20000)]
-”陈亿训唱的“,所以我们需要column=[歌手名], cell values = ['陈亿训'],所以有[歌手名:('陈亿训')]""",
-"schemaLinks":"""["歌曲名":("'孤勇者'"), "结算播放量":(20000), "歌手名":("'陈亿训'")]""",
-"sql":"""SELECT 歌曲名 FROM 歌曲库 WHERE 结算播放量 > 20000 AND 歌手名 = '陈亿训' AND 歌曲名 = '孤勇者'"""
-},
-{ "currentDate":"2023-09-18",
-"tableName":"歌曲库",
-"fieldsList":"""["歌曲名", "歌曲版本", "歌手名", "歌曲类型", "发布时间", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""",
-"question":"周洁轮去年发布的歌曲有哪些",
-"priorSchemaLinks":"""['23109'->MPPM歌手ID]""",
-"analysis": """让我们一步一步地思考。在问题“周洁轮去年发布的歌曲有哪些“中,我们被问:
-“歌曲有哪些”,所以我们需要column=[歌曲名]
-”去年发布的“,所以我们需要column=[发布时间], cell values = [1],所以有[发布时间:(1)]
-”周洁轮“,所以我们需要column=[歌手名], cell values = ['周洁轮'],所以有[歌手名:('周洁轮')]""",
-"schemaLinks":"""["歌曲名", "发布时间":(1), "歌手名":("'周洁轮'")]""",
-"sql":"""SELECT 歌曲名 FROM 歌曲库 WHERE datediff('year', 发布时间, '2023-09-18') <= 1 AND 歌手名 = '周洁轮'"""
-},
-{ "currentDate":"2023-09-11",
-"tableName":"艺人库",
-"fieldsList":"""["播放量层级", "播放量单调性", "播放量方差", "播放量突增类型", "播放量集中度", "歌手名", "歌手等级", "歌手类型", "歌手来源", "签约日期", "MPPM潮流人等级", "结算播放量", "运营播放量", "历史累计结算播放量", "有播放量歌曲数", "历史累计运营播放量", "付费用户结算播放量", "结算播放量占比", "运营播放份额", "免费用户结算播放占比", "完播量", "数据日期"]""",
-"question":"我想要近半年签约的播放量前十的歌手有哪些",
-"priorSchemaLinks":"""[]""",
-"analysis": """让我们一步一步地思考。在问题“我想要近半年签约的播放量前十的歌手“中,我们被问:
-“歌手有哪些”,所以我们需要column=[歌手名]
-”播放量前十的“,所以我们需要column=[结算播放量], cell values = [10],所以有[结算播放量:(10)]
-”近半年签约的“,所以我们需要column=[签约日期], cell values = [0.5],所以有[签约日期:(0.5)]""",
-"schemaLinks":"""["歌手名", "结算播放量":(10), "签约日期":(0.5)]""",
-"sql":"""SELECT 歌手名 FROM 艺人库 WHERE datediff('year', 签约日期, '2023-09-11') <= 0.5 ORDER BY 结算播放量 DESC LIMIT 10"""
-},
-{ "currentDate":"2023-08-12",
-"tableName":"歌曲库",
-"fieldsList": """["发行日期", "歌曲语言", "歌曲来源", "歌曲流派", "歌曲名", "歌曲版本", "歌曲类型", "发行时间", "数据日期"]""",
-"question":"最近一年发行的歌曲中,有哪些在近7天播放超过一千万的",
-"priorSchemaLinks":"""[]""",
-"analysis": """让我们一步一步地思考。在问题“最近一年发行的歌曲中,有哪些在近7天播放超过一千万的“中,我们被问:
-“发行的歌曲中,有哪些”,所以我们需要column=[歌曲名]
-”最近一年发行的“,所以我们需要column=[发行日期], cell values = [1],所以有[发行日期:(1)]
-”在近7天播放超过一千万的“,所以我们需要column=[数据日期, 结算播放量], cell values = [7, 10000000],所以有[数据日期:(7), 结算播放量:(10000000)]""",
-"schemaLinks":"""["歌曲名", "发行日期":(1), "数据日期":(7), "结算播放量":(10000000)]""",
-"sql":"""SELECT 歌曲名 FROM 歌曲库 WHERE datediff('year', 发行日期, '2023-08-12') <= 1 AND datediff('day', 数据日期, '2023-08-12') <= 7 AND 结算播放量 > 10000000"""
-},
-{ "currentDate":"2023-08-12",
-"tableName":"歌曲库",
-"fieldsList": """["发行日期", "歌曲语言", "歌曲来源", "歌曲流派", "歌曲名", "歌曲版本", "歌曲类型", "发行时间", "数据日期"]""",
-"question":"今年以来发行的歌曲中,有哪些在近7天播放超过一千万的",
-"priorSchemaLinks":"""[]""",
-"analysis": """让我们一步一步地思考。在问题“今年以来发行的歌曲中,有哪些在近7天播放超过一千万的“中,我们被问:
-“发行的歌曲中,有哪些”,所以我们需要column=[歌曲名]
-”今年以来发行的“,所以我们需要column=[发行日期], cell values = [0],所以有[发行日期:(0)]
-”在近7天播放超过一千万的“,所以我们需要column=[数据日期, 结算播放量], cell values = [7, 10000000],所以有[数据日期:(7), 结算播放量:(10000000)]""",
-"schemaLinks":"""["歌曲名", "发行日期":(0), "数据日期":(7), "结算播放量":(10000000)]""",
-"sql":"""SELECT 歌曲名 FROM 歌曲库 WHERE datediff('year', 发行日期, '2023-08-12') <= 0 AND datediff('day', 数据日期, '2023-08-12') <= 7 AND 结算播放量 > 10000000"""
-},
-{ "currentDate":"2023-08-12",
-"tableName":"歌曲库",
-"fieldsList": """["发行日期", "歌曲语言", "歌曲来源", "歌曲流派", "歌曲名", "歌曲版本", "歌曲类型", "发行时间", "数据日期"]""",
-"question":"2023年以来发行的歌曲中,有哪些在近7天播放超过一千万的",
-"priorSchemaLinks":"""['514129144'->MPPM歌曲ID]""",
-"analysis": """让我们一步一步地思考。在问题“2023年以来发行的歌曲中,有哪些在近7天播放超过一千万的“中,我们被问:
-“发行的歌曲中,有哪些”,所以我们需要column=[歌曲名]
-”2023年以来发行的“,所以我们需要column=[发行日期], cell values = ['2023-01-01'],所以有[发行日期:('2023-01-01')]
-”在近7天播放超过一千万的“,所以我们需要column=[数据日期, 结算播放量], cell values = [7, 10000000],所以有[数据日期:(7), 结算播放量:(10000000)]""",
-"schemaLinks":"""["歌曲名", "发行日期":("'2023-01-01'"), "数据日期":(7), "结算播放量":(10000000)]""",
-"sql":"""SELECT 歌曲名 FROM 歌曲库 WHERE 发行日期 >= '2023-01-01' AND datediff('day', 数据日期, '2023-08-12') <= 7 AND 结算播放量 > 10000000"""
-},
-{ "currentDate":"2023-08-01",
-"tableName":"歌曲库",
-"fieldsList":"""["歌曲名", "歌曲版本", "歌手名", "歌曲类型", "发布时间", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""",
-"question":"周洁轮2023年6月之后发布的歌曲有哪些",
-"priorSchemaLinks":"""['23109'->MPPM歌手ID]""",
-"analysis": """让我们一步一步地思考。在问题“周洁轮2023年6月之后发布的歌曲有哪些“中,我们被问:
-“歌曲有哪些”,所以我们需要column=[歌曲名]
-”2023年6月之后发布的“,所以我们需要column=[发布时间], cell values = ['2023-06-01'],所以有[发布时间:('2023-06-01')]
-”周洁轮“,所以我们需要column=[歌手名], cell values = ['周洁轮'],所以有[歌手名:('周洁轮')]""",
-"schemaLinks":"""["歌曲名", "发布时间":("'2023-06-01'"), "歌手名":("'周洁轮'")]""",
-"sql":"""SELECT 歌曲名 FROM 歌曲库 WHERE 发布时间 >= '2023-06-01' AND 歌手名 = '周洁轮'"""
-},
-{ "currentDate":"2023-08-01",
-"tableName":"歌曲库",
-"fieldsList":"""["歌曲名", "歌曲版本", "歌手名", "歌曲类型", "发布时间", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""",
-"question":"邓梓琦在2023年1月5日之后发布的歌曲中,有哪些播放量大于500W的?",
-"priorSchemaLinks":"""['2312311'->MPPM歌手ID]""",
-"analysis": """让我们一步一步地思考。在问题“邓梓琦在2023年1月5日之后发布的歌曲中,有哪些播放量大于500W的?“中,我们被问:
-“歌曲中,有哪些”,所以我们需要column=[歌曲名]
-“播放量大于500W的”,所以我们需要column=[结算播放量], cell values = [5000000],所以有[结算播放量:(5000000)]
-”邓梓琦在2023年1月5日之后发布的“,所以我们需要column=[发布时间], cell values = ['2023-01-05'],所以有[发布时间:('2023-01-05')]
-”邓梓琦“,所以我们需要column=[歌手名], cell values = ['邓梓琦'],所以有[歌手名:('邓梓琦')]""",
-"schemaLinks":"""["歌曲名", "结算播放量":(5000000), "发布时间":("'2023-01-05'"), "歌手名":("'邓梓琦'")]""",
-"sql":"""SELECT 歌曲名 FROM 歌曲库 WHERE 发布时间 >= '2023-01-05' AND 歌手名 = '邓梓琦' AND 结算播放量 > 5000000"""
-},
-{ "currentDate":"2023-09-17",
-"tableName":"歌曲库",
-"fieldsList":"""["歌曲名", "歌曲版本", "歌手名", "歌曲类型", "发布时间", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""",
-"question":"2023年6月以后,张亮英播放量大于200万的歌曲有哪些?",
-"priorSchemaLinks":"""['45453'->MPPM歌手ID]""",
-"analysis": """让我们一步一步地思考。在问题“2023年6月以后,张亮英播放量大于200万的歌曲有哪些?“中,我们被问:
-“播放量大于200万的”,所以我们需要column=[结算播放量], cell values = [2000000],所以有[结算播放量:(2000000)]
-”2023年6月以后,张亮英“,所以我们需要column=[数据日期, 歌手名], cell values = ['2023-06-01', '张亮英'],所以有[数据日期:('2023-06-01'), 歌手名:('张亮英')],
-”歌曲有哪些“,所以我们需要column=[歌曲名]""",
-"schemaLinks":"""["结算播放量":(2000000), "数据日期":("'2023-06-01'"), "歌手名":("'张亮英'"), "歌曲名"]""",
-"sql":"""SELECT 歌曲名 FROM 歌曲库 WHERE 数据日期 >= '2023-06-01' AND 歌手名 = '张亮英' AND 结算播放量 > 2000000"""
-},
-{ "currentDate":"2023-08-16",
-"tableName":"歌曲库",
-"fieldsList":"""["歌曲名", "歌曲版本", "歌手名", "歌曲类型", "发布时间", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""",
-"question":"2021年6月以后发布的李雨纯的播放量大于20万的歌曲有哪些",
-"priorSchemaLinks":"""['23109'->MPPM歌手ID]""",
-"analysis": """让我们一步一步地思考。在问题“2021年6月以后发布的李雨纯的播放量大于20万的歌曲有哪些“中,我们被问:
-“播放量大于20万的”,所以我们需要column=[结算播放量], cell values = [200000],所以有[结算播放量:(200000)]
-”2021年6月以后发布的“,所以我们需要column=[发布时间], cell values = ['2021-06-01'],所以有[发布时间:('2021-06-01')]
-”李雨纯“,所以我们需要column=[歌手名], cell values = ['李雨纯'],所以有[歌手名:('李雨纯')]""",
-"schemaLinks":"""["结算播放量":(200000), "发布时间":("'2021-06-01'"), "歌手名":("'李雨纯'")]""",
-"sql":"""SELECT 歌曲名 FROM 歌曲库 WHERE 发布时间 >= '2021-06-01' AND 歌手名 = '李雨纯' AND 结算播放量 > 200000"""
-},
-{ "currentDate":"2023-08-16",
-"tableName":"歌曲库",
-"fieldsList":"""["歌曲名", "歌曲版本", "歌手名", "歌曲类型", "发布时间", "MPPM歌曲ID", "是否严选窄口径歌曲", "是否严选宽口径歌曲", "是否潮流人歌曲", "超声波歌曲ID", "C音歌曲ID", "C音歌曲MID", "结算播放量", "运营播放量", "分享量", "收藏量", "运营搜播量", "结算搜播量", "拉新用户数", "拉活用户数", "分享率", "结算播放份额", "数据日期"]""",
-"question":"刘锝桦在1992年4月2日到2020年5月2日之间发布的播放量大于20万的歌曲有哪些",
-"priorSchemaLinks":"""['4234234'->MPPM歌手ID]""",
-"analysis": """让我们一步一步地思考。在问题“刘锝桦在1992年4月2日到2020年5月2日之间发布的播放量大于20万的歌曲有哪些“中,我们被问:
-“播放量大于20万的”,所以我们需要column=[结算播放量], cell values = [200000],所以有[结算播放量:(200000)]
-”1992年4月2日到2020年5月2日之间发布的“, 所以我们需要column=[发布时间], cell values = ['1992-04-02', '2020-05-02'],所以有[发布时间:('1992-04-02', '2020-05-02')]
-”刘锝桦“,所以我们需要column=[歌手名], cell values = ['刘锝桦'],所以有[歌手名:('刘锝桦')]""",
-"schemaLinks":"""["结算播放量":(200000), "发布时间":("'1992-04-02'", "'2020-05-02'"), "歌手名":("'刘锝桦'")]""",
-"sql":"""SELECT 歌曲名 FROM 歌曲库 WHERE 发布时间 >= '1992-04-02' AND 发布时间 <= '2020-05-02' AND 歌手名 = '刘锝桦' AND 结算播放量 > 200000"""
-},
-{
-"currentDate":"2023-09-04",
-"tableName":"超音数产品",
-"fieldsList":"""["用户名", "部门", "模块", "访问时长", "访问次数", "访问人数", "数据日期"]""",
-"question":"超音数近30天访问次数的平均数",
-"priorSchemaLinks":"""[]""",
-"analysis": """让我们一步一步地思考。在问题“超音数近30天访问次数的平均数“中,我们被问:
-“访问次数的平均数”,所以我们需要column=[访问次数]
-”超音数近30天“,所以我们需要column=[数据日期], cell values = [30],所以有[数据日期:(30)]""",
-"schemaLinks":"""["访问次数", "数据日期":(30)]""",
-"sql":"""SELECT AVG(访问次数) FROM 超音数产品 WHERE datediff('day', 数据日期, '2023-09-04') <= 30 """
-},
-{
-"currentDate":"2023-09-04",
-"tableName":"超音数产品",
-"fieldsList":"""["用户名", "部门", "模块", "访问时长", "访问次数", "访问人数", "数据日期"]""",
-"question":"超音数近半年哪个月的访问次数汇总最高",
-"priorSchemaLinks":"""[]""",
-"analysis": """让我们一步一步地思考。在问题“超音数近半年哪个月的访问次数汇总最高“中,我们被问:
-“访问次数汇总最高”,所以我们需要column=[访问次数], cell values = [1],所以有[访问次数:(1)]
-”超音数近半年“,所以我们需要column=[数据日期], cell values = [0.5],所以有[数据日期:(0.5)]""",
-"schemaLinks":"""["访问次数":(1), "数据日期":(0.5)]""",
-"sql":"""SELECT MONTH(数据日期), SUM(访问次数) FROM 超音数产品 WHERE datediff('year', 数据日期, '2023-09-04') <= 0.5 GROUP BY MONTH(数据日期) ORDER BY SUM(访问次数) DESC LIMIT 1"""
-},
-{
-"currentDate":"2023-09-04",
-"tableName":"超音数产品",
-"fieldsList":"""["用户名", "部门", "模块", "访问时长", "访问次数", "访问人数", "数据日期"]""",
-"question":"超音数近半年每个月的平均访问次数",
-"priorSchemaLinks":"""[]""",
-"analysis": """让我们一步一步地思考。在问题“超音数近半年每个月的平均访问次数“中,我们被问:
-“每个月的平均访问次数”,所以我们需要column=[访问次数]
-”超音数近半年“,所以我们需要column=[数据日期], cell values = [0.5],所以有[数据日期:(0.5)]""",
-"schemaLinks":"""["访问次数", "数据日期":(0.5)]""",
-"sql":"""SELECT MONTH(数据日期), AVG(访问次数) FROM 超音数产品 WHERE datediff('year', 数据日期, '2023-09-04') <= 0.5 GROUP BY MONTH(数据日期)"""
-},
-{
-"currentDate":"2023-09-10",
-"tableName":"超音数产品",
-"fieldsList":"""["用户名", "部门", "模块", "访问时长", "访问次数", "访问人数", "数据日期"]""",
-"question":"超音数 按部门统计访问次数 top10 的部门",
-"priorSchemaLinks":"""[]""",
-"analysis": """让我们一步一步地思考。在问题“超音数 按部门统计访问次数 top10 的部门“中,我们被问:
-“访问次数 top10 的部门”,所以我们需要column=[访问次数], cell values = [10],所以有[访问次数:(10)]
-”超音数 按部门统计“,所以我们需要column=[部门]""",
-"schemaLinks":"""["访问次数":(10), "部门"]""",
-"sql":"""SELECT 部门, SUM(访问次数) FROM 超音数产品 GROUP BY 部门 ORDER BY SUM(访问次数) DESC LIMIT 10"""
-},
-{
-"currentDate":"2023-09-10",
-"tableName":"超音数产品",
-"fieldsList":"""["用户名", "部门", "模块", "访问时长", "访问次数", "访问人数", "数据日期"]""",
-"question":"超音速 近7个月,月度总访问量超过 2万的月份",
-"priorSchemaLinks":"""[]""",
-"analysis": """让我们一步一步地思考。在问题“超音速 近7个月,月度总访问量超过 2万的月份“中,我们被问:
-“月度总访问量超过 2万的月份”,所以我们需要column=[访问次数], cell values = [20000],所以有[访问次数:(20000)]
-”超音速 近7个月“,所以我们需要column=[数据日期], cell values = [7],所以有[数据日期:(7)]""",
-"schemaLinks":"""["访问次数":(20000), "数据日期":(7)]""",
-"sql":"""SELECT MONTH(数据日期) FROM 超音数产品 WHERE datediff('month', 数据日期, '2023-09-10') <= 7 GROUP BY MONTH(数据日期) HAVING SUM(访问次数) > 20000"""
-},
-{
-"currentDate":"2023-09-10",
-"tableName":"歌曲库",
-"fieldsList":"""["歌曲语言", "歌曲来源", "运营播放量", "播放量", "歌曲名", "结算播放量", "专辑名", "发布日期", "歌曲版本", "歌曲类型", "数据日期"]""",
-"question":"2022年7月到2023年7月之间发布到歌曲,按播放量取top 100,再按月粒度来统计近1年的运营播放量",
-"priorSchemaLinks":"""[]""",
-"analysis": """让我们一步一步地思考。在问题“2022年7月到2023年7月之间发布到歌曲,按播放量取top 100,再按月粒度来统计近1年的运营播放量“中,我们被问:
-“按月粒度来统计近1年的运营播放量”,所以我们需要column=[运营播放量, 数据日期], cell values = [1],所以有[运营播放量, 数据日期:(1)]
-”按播放量取top 100“,所以我们需要column=[播放量], cell values = [100],所以有[播放量:(100)]
-“2022年7月到2023年7月之间发布到歌曲”,所以我们需要column=[发布日期], cell values = ['2022-07-01', '2023-07-01'],所以有[发布日期:('2022-07-01', '2023-07-01')]""",
-"schemaLinks":"""["运营播放量", "数据日期":(1), "播放量":(100), "发布日期":("'2022-07-01'", "'2023-07-01'")]""",
-"sql":"""SELECT MONTH(数据日期), SUM(运营播放量) FROM (SELECT 数据日期, 运营播放量 FROM 歌曲库 WHERE 发布日期 >= '2022-07-01' AND 发布日期 <= '2023-07-01' ORDER BY 播放量 DESC LIMIT 100) t WHERE datediff('year', 数据日期, '2023-09-10') <= 1 GROUP BY MONTH(数据日期)"""
-},
-{
-"currentDate":"2023-09-10",
-"tableName":"歌曲库",
-"fieldsList":"""["歌曲语言", "歌曲来源", "运营播放量", "播放量", "歌曲名", "结算播放量", "专辑名", "发布日期", "歌曲版本", "歌曲类型", "数据日期"]""",
-"question":"2022年7月到2023年7月之间发布到歌曲,按播放量取top100,再按月粒度来统计近1年的运营播放量之和,筛选出其中运营播放量之和大于2k的月份",
-"priorSchemaLinks":"""[]""",
-"analysis": """让我们一步一步地思考。在问题“2022年7月到2023年7月之间发布到歌曲,按播放量取top100,再按月粒度来统计近1年的运营播放量之和,筛选出其中运营播放量之和大于2k的月份“中,我们被问:
-“筛选出其中运营播放量之和大于2k的月份”,所以我们需要column=[运营播放量], cell values = [2000],所以有[运营播放量:(2000)]
-”按月粒度来统计近1年的运营播放量之和“,所以我们需要column=[数据日期], cell values = [1],所以有[数据日期:(1)]
-”按播放量取top100“,所以我们需要column=[播放量], cell values = [100],所以有[播放量:(100)]
-”2022年7月到2023年7月之间发布到歌曲“,所以我们需要column=[发布日期], cell values = ['2022-07-01', '2023-07-01'],所以有[发布日期:('2022-07-01', '2023-07-01')]""",
-"schemaLinks":"""["运营播放量":(2000), "数据日期":(1), "播放量":(100), "发布日期":("'2022-07-01'", "'2023-07-01'")]""",
-"sql":"""SELECT MONTH(数据日期), SUM(运营播放量) FROM (SELECT 数据日期, 运营播放量 FROM 歌曲库 WHERE 发布日期 >= '2022-07-01' AND 发布日期 <= '2023-07-01' ORDER BY 播放量 DESC LIMIT 100) t WHERE datediff('year', 数据日期, '2023-09-10') <= 1 GROUP BY MONTH(数据日期) HAVING SUM(运营播放量) > 2000"""
-},
-{
-"currentDate":"2023-11-01",
-"tableName":"营销月模型",
-"fieldsList":"""["国家中文名", "机型类别", "销量", "数据日期"]""",
-"question":"今年智能机在哪个国家的销量之和最高",
-"priorSchemaLinks":"""['智能机'->机型类别]""",
-"analysis": """让我们一步一步地思考。在问题“今年智能机在哪个国家的销量之和最高“中,我们被问:
-“销量最高”,所以我们需要column=[销量], cell values = [1],所以有[销量:(1)]
-”今年“,所以我们需要column=[数据日期], cell values = ['2023-01-01', '2023-11-01'],所以有[数据日期:('2023-01-01', '2023-11-01')]
-”智能机“,所以我们需要column=[机型类别], cell values = ['智能机'],所以有[机型类别:('智能机')]""",
-"schemaLinks":"""["销量":(1), "数据日期":("'2023-01-01'", "'2023-11-01'"), "机型类别":("'智能机'")]""",
-"sql":"""SELECT 国家中文名, SUM(销量) FROM 营销月模型 WHERE 机型类别 = '智能机' AND 数据日期 >= '2023-01-01' AND 数据日期 <= '2023-11-01' GROUP BY 国家中文名 ORDER BY SUM(销量) DESC LIMIT 1"""
-}
-]

@@ -1,281 +0,0 @@
|
|||||||
[
{
"question": "比较jackjchen和robinlee在超音数的访问次数",
"questionAugmented": "比较jackjchen和robinlee在超音数的访问次数 (补充信息:’'jackjchen'‘是一个’用户名‘,’ 'robinlee'‘是一个’用户名‘。当前的日期是2020-12-01) (备注: )",
"modelName": "超音数产品",
"dbSchema": "Table: 超音数产品, Columns = [\"部门\", \"模块\", \"用户名\", \"访问次数\", \"访问人数\", \"访问时长\", \"数据日期\"]",
"sql": "SELECT 用户名, 访问次数 FROM 超音数产品 WHERE 用户名 IN ('jackjchen', 'robinlee')",
"generatedSchemaLinkingCoT": "Let’s think step by step. In the question \"比较jackjchen和robinlee在超音数的访问次数 (补充信息:’'jackjchen'‘是一个’用户名‘,’ 'robinlee'‘是一个’用户名‘。当前的日期是2020-12-01) (备注: )\", we are asked:\n\"的访问次数 \" so we need column = [访问次数]\n\"’用户名‘,\" so we need column = [用户名]\nBased on the tables, columns, and Foreign_keys, The set of possible cell values are = [jackjchen,robinlee]. So the Schema_links are:\nSchema_links: [访问次数,用户名,jackjchen,robinlee]",
"generatedSchemaLinkings": "[访问次数,用户名,jackjchen,robinlee]"
},
{
"question": "超音数近12个月访问人数 按部门",
"questionAugmented": "超音数近12个月访问人数 按部门 (补充信息:。当前的日期是2022-11-06) (备注: )",
"modelName": "超音数产品",
"dbSchema": "Table: 超音数产品, Columns = [\"部门\", \"模块\", \"用户名\", \"访问次数\", \"访问人数\", \"访问时长\", \"数据日期\"]",
"sql": "SELECT 部门, 数据日期, 访问人数 FROM 超音数产品 WHERE datediff('month', 数据日期, '2022-11-06') <= 12 ",
"generatedSchemaLinkingCoT": "Let’s think step by step. In the question \"超音数近12个月访问人数 按部门 (补充信息:。当前的日期是2022-11-06) (备注: )\", we are asked:\n\"当前的日期是\" so we need column = [数据日期]\n\" 按部门 (\" so we need column = [部门]\n\"访问人数 按\" so we need column = [访问人数]\nBased on the tables, columns, and Foreign_keys, The set of possible cell values are = [12,month]. So the Schema_links are:\nSchema_links: [数据日期,部门,访问人数,12,month]",
"generatedSchemaLinkings": "[数据日期,部门,访问人数,12,month]"
},
{
"question": "超音数美术部、技术研发部的访问时长",
"questionAugmented": "超音数美术部、技术研发部的访问时长 (补充信息:’'美术部'‘是一个’部门‘,’ '技术研发部'‘是一个’部门‘。当前的日期是2023-04-21) (备注: )",
"modelName": "超音数产品",
"dbSchema": "Table: 超音数产品, Columns = [\"部门\", \"模块\", \"用户名\", \"访问次数\", \"访问人数\", \"访问时长\", \"数据日期\"]",
"sql": "SELECT 部门, 访问时长 FROM 超音数产品 WHERE 部门 IN ('美术部', '技术研发部')",
"generatedSchemaLinkingCoT": "Let’s think step by step. In the question \"超音数美术部、技术研发部的访问时长 (补充信息:’'美术部'‘是一个’部门‘,’ '技术研发部'‘是一个’部门‘。当前的日期是2023-04-21) (备注: )\", we are asked:\n\"部门‘,’ \" so we need column = [部门]\n\"的访问时长 \" so we need column = [访问时长]\nBased on the tables, columns, and Foreign_keys, The set of possible cell values are = [美术部,技术研发部]. So the Schema_links are:\nSchema_links: [部门,访问时长,美术部,技术研发部]",
"generatedSchemaLinkings": "[部门,访问时长,美术部,技术研发部]"
},
{
"question": "近3天海田飞系MPPM结算播放份额",
"questionAugmented": "近3天海田飞系MPPM结算播放份额 (补充信息:’'海田飞系'‘是一个’严选版权归属系‘。当前的日期是2023-08-21) (备注: )",
"modelName": "严选",
"dbSchema": "Table: 严选, Columns = [\"严选版权归属系\", \"付费模式\", \"结算播放份额\", \"付费用户结算播放份额\", \"数据日期\"]",
"sql": "SELECT 严选版权归属系, 结算播放份额 FROM 严选 WHERE 严选版权归属系 = '海田飞系' AND datediff('day', 数据日期, '2023-08-21') <= 3 ",
"generatedSchemaLinkingCoT": "Let’s think step by step. In the question \"近3天海田飞系MPPM结算播放份额 (补充信息:’'海田飞系'‘是一个’严选版权归属系‘。当前的日期是2023-08-21) (备注: )\", we are asked:\n\"结算播放份额 \" so we need column = [结算播放份额]\n\"当前的日期是\" so we need column = [数据日期]\n\"严选版权归属系\" so we need column = [严选版权归属系]\nBased on the tables, columns, and Foreign_keys, The set of possible cell values are = [海田飞系,3,day]. So the Schema_links are:\nSchema_links: [结算播放份额,数据日期,严选版权归属系,海田飞系,3,day]",
"generatedSchemaLinkings": "[结算播放份额,数据日期,严选版权归属系,海田飞系,3,day]"
},
{
"question": "对比近7天翻唱版和纯音乐的歌曲播放量",
"questionAugmented": "对比近7天翻唱版和纯音乐的歌曲播放量 (补充信息:’'纯音乐'‘是一个’语种‘,’ '翻唱版'‘是一个’歌曲版本‘。当前的日期是2023-05-22) (备注: )",
"modelName": "歌曲库",
"dbSchema": "Table: 歌曲库, Columns = [\"是否潮流人歌曲\", \"C音歌曲ID\", \"C音歌曲MID\", \"歌曲名\", \"歌曲版本\", \"语种\", \"歌曲类型\", \"翻唱类型\", \"MPPM歌曲ID\", \"是否严选窄口径歌曲\", \"是否严选宽口径歌曲\", \"结算播放量\", \"运营播放量\", \"付费用户结算播放量\", \"历史累计结算播放量\", \"运营搜播量\", \"结算搜播量\", \"运营完播量\", \"运营推播量\", \"近7日复播率\", \"日均搜播量\", \"数据日期\"]",
"sql": "SELECT 歌曲版本, 语种, 结算播放量 FROM 歌曲库 WHERE 歌曲版本 = '翻唱版' AND 语种 = '纯音乐' AND datediff('day', 数据日期, '2023-05-22') <= 7 ",
"generatedSchemaLinkingCoT": "Let’s think step by step. In the question \"对比近7天翻唱版和纯音乐的歌曲播放量 (补充信息:’'纯音乐'‘是一个’语种‘,’ '翻唱版'‘是一个’歌曲版本‘。当前的日期是2023-05-22) (备注: )\", we are asked:\n\"当前的日期是\" so we need column = [数据日期]\n\"’歌曲版本‘\" so we need column = [歌曲版本]\n\"语种‘,’ \" so we need column = [语种]\n\"曲播放量 (\" so we need column = [结算播放量]\nBased on the tables, columns, and Foreign_keys, The set of possible cell values are = [7,翻唱版,纯音乐,day]. So the Schema_links are:\nSchema_links: [数据日期,歌曲版本,语种,结算播放量,7,翻唱版,纯音乐,day]",
"generatedSchemaLinkings": "[数据日期,歌曲版本,语种,结算播放量,7,翻唱版,纯音乐,day]"
},
{
"question": "对比一下陈拙悬、孟梅琦、赖媚韵的粉丝数",
"questionAugmented": "对比一下陈拙悬、孟梅琦、赖媚韵的粉丝数 (补充信息:’'1527896'‘是一个’MPPM歌手ID‘,’ '1565463'‘是一个’MPPM歌手ID‘,’ '2141459'‘是一个’MPPM歌手ID‘。当前的日期是2023-05-31) (备注: )",
"modelName": "艺人库",
"dbSchema": "Table: 艺人库, Columns = [\"上下架状态\", \"歌手名\", \"歌手等级\", \"歌手类型\", \"歌手来源\", \"MPPM潮流人等级\", \"活跃区域\", \"年龄\", \"歌手才能\", \"歌手风格\", \"粉丝数\", \"潮音粉丝数\", \"超声波粉丝数\", \"推博粉丝数\", \"超声波歌曲数\", \"在架歌曲数\", \"超声波分享数\", \"独占歌曲数\", \"超声波在架歌曲评论数\", \"有播放量歌曲数\", \"数据日期\"]",
"sql": "SELECT 歌手名, 粉丝数 FROM 艺人库 WHERE 歌手名 IN ('陈拙悬', '孟梅琦', '赖媚韵')",
"generatedSchemaLinkingCoT": "Let’s think step by step. In the question \"对比一下陈拙悬、孟梅琦、赖媚韵的粉丝数 (补充信息:’'1527896'‘是一个’MPPM歌手ID‘,’ '1565463'‘是一个’MPPM歌手ID‘,’ '2141459'‘是一个’MPPM歌手ID‘。当前的日期是2023-05-31) (备注: )\", we are asked:\n\"歌手ID‘,\" so we need column = [歌手名]\n\"的粉丝数 (\" so we need column = [粉丝数]\nBased on the tables, columns, and Foreign_keys, The set of possible cell values are = [陈拙悬,孟梅琦,赖媚韵]. So the Schema_links are:\nSchema_links: [歌手名,粉丝数,陈拙悬,孟梅琦,赖媚韵]",
"generatedSchemaLinkings": "[歌手名,粉丝数,陈拙悬,孟梅琦,赖媚韵]"
},
{
"question": "播放量大于1万的歌曲有多少",
"questionAugmented": "播放量大于1万的歌曲有多少 (补充信息:。当前的日期是2023-07-31) (备注: )",
"modelName": "歌曲库",
"dbSchema": "Table: 歌曲库, Columns = [\"歌曲名\", \"歌曲版本\", \"歌曲类型\", \"MPPM歌曲ID\", \"是否严选窄口径歌曲\", \"是否严选宽口径歌曲\", \"是否潮流人歌曲\", \"超声波歌曲ID\", \"C音歌曲ID\", \"C音歌曲MID\", \"结算播放量\", \"运营播放量\", \"分享量\", \"收藏量\", \"运营搜播量\", \"结算搜播量\", \"拉新用户数\", \"拉活用户数\", \"分享率\", \"结算播放份额\", \"数据日期\"]",
"sql": "SELECT 歌曲名 FROM 歌曲库 WHERE 结算播放量 > 10000",
"generatedSchemaLinkingCoT": "Let’s think step by step. In the question \"播放量大于1万的歌曲有多少 (补充信息:。当前的日期是2023-07-31) (备注: )\", we are asked:\n\"歌曲有多少 \" so we need column = [歌曲名]\n\"播放量大于1\" so we need column = [结算播放量]\nBased on the tables, columns, and Foreign_keys, The set of possible cell values are = [10000]. So the Schema_links are:\nSchema_links: [歌曲名,结算播放量,10000]",
"generatedSchemaLinkings": "[歌曲名,结算播放量,10000]"
},
{
"question": "超音数访问时长小于1小时,且来自美术部的用户是哪些",
"questionAugmented": "超音数访问时长小于1小时,且来自美术部的用户是哪些 (补充信息:’'美术部'‘是一个’部门‘。当前的日期是2023-07-31) (备注: )",
"modelName": "超音数产品",
"dbSchema": "Table: 超音数产品, Columns = [\"用户名\", \"部门\", \"模块\", \"访问时长\", \"访问次数\", \"访问人数\", \"数据日期\"]",
"sql": "SELECT 用户名 FROM 超音数产品 WHERE 部门 = '美术部' AND 访问时长 < 1",
"generatedSchemaLinkingCoT": "Let’s think step by step. In the question \"超音数访问时长小于1小时,且来自美术部的用户是哪些 (补充信息:’'美术部'‘是一个’部门‘。当前的日期是2023-07-31) (备注: )\", we are asked:\n\"一个’部门‘\" so we need column = [部门]\n\"访问时长小于\" so we need column = [访问时长]\n\"术部的用户是\" so we need column = [用户名]\nBased on the tables, columns, and Foreign_keys, The set of possible cell values are = [美术部,1]. So the Schema_links are:\nSchema_links: [部门,访问时长,用户名,美术部,1]",
"generatedSchemaLinkings": "[部门,访问时长,用户名,美术部,1]"
},
{
"question": "超音数pv最高的用户有哪些",
"questionAugmented": "超音数pv最高的用户有哪些 (补充信息:。当前的日期是2023-08-31) (备注: )",
"modelName": "超音数产品",
"dbSchema": "Table: 超音数产品, Columns = [\"用户名\", \"部门\", \"模块\", \"访问时长\", \"访问次数\", \"访问人数\", \"数据日期\"]",
"sql": "SELECT 用户名 FROM 超音数产品 ORDER BY 访问次数 DESC LIMIT 1",
"generatedSchemaLinkingCoT": "Let’s think step by step. In the question \"超音数pv最高的用户有哪些 (补充信息:。当前的日期是2023-08-31) (备注: )\", we are asked:\n\"最高的用户有\" so we need column = [访问次数]\n\"用户有哪些 (\" so we need column = [用户名]\nBased on the tables, columns, and Foreign_keys, The set of possible cell values are = [1]. So the Schema_links are:\nSchema_links: [访问次数,用户名,1]",
"generatedSchemaLinkings": "[访问次数,用户名,1]"
},
{
"question": "近90天袁亚伟播放量平均值是多少",
"questionAugmented": "近90天袁亚伟播放量平均值是多少 (补充信息:’'152789226'‘是一个’MPPM歌手ID‘。当前的日期是2023-08-31) (备注: )",
"modelName": "艺人库",
"dbSchema": "Table: 艺人库, Columns = [\"播放量层级\", \"播放量单调性\", \"播放量方差\", \"播放量突增类型\", \"播放量集中度\", \"歌手名\", \"歌手等级\", \"歌手类型\", \"歌手来源\", \"MPPM潮流人等级\", \"结算播放量\", \"运营播放量\", \"历史累计结算播放量\", \"有播放量歌曲数\", \"历史累计运营播放量\", \"付费用户结算播放量\", \"结算播放量占比\", \"运营播放份额\", \"免费用户结算播放占比\", \"完播量\", \"数据日期\"]",
"sql": "SELECT AVG(结算播放量) FROM 艺人库 WHERE 歌手名 = '袁亚伟' AND datediff('day', 数据日期, '2023-08-31') <= 90 ",
"generatedSchemaLinkingCoT": "Let’s think step by step. In the question \"近90天袁亚伟播放量平均值是多少 (补充信息:’'152789226'‘是一个’MPPM歌手ID‘。当前的日期是2023-08-31) (备注: )\", we are asked:\n\"歌手ID‘。\" so we need column = [歌手名]\n\"当前的日期是\" so we need column = [数据日期]\n\"播放量平均值\" so we need column = [结算播放量]\nBased on the tables, columns, and Foreign_keys, The set of possible cell values are = [袁亚伟,90,day]. So the Schema_links are:\nSchema_links: [歌手名,数据日期,结算播放量,袁亚伟,90,day]",
"generatedSchemaLinkings": "[歌手名,数据日期,结算播放量,袁亚伟,90,day]"
},
{
"question": "周倩倩近7天结算播放量总和是多少",
"questionAugmented": "周倩倩近7天结算播放量总和是多少 (补充信息:’'199509'‘是一个’MPPM歌手ID‘。当前的日期是2023-08-31) (备注: )",
"modelName": "艺人库",
"dbSchema": "Table: 艺人库, Columns = [\"播放量层级\", \"播放量单调性\", \"播放量方差\", \"播放量突增类型\", \"播放量集中度\", \"歌手名\", \"歌手等级\", \"歌手类型\", \"歌手来源\", \"MPPM潮流人等级\", \"结算播放量\", \"运营播放量\", \"历史累计结算播放量\", \"有播放量歌曲数\", \"历史累计运营播放量\", \"付费用户结算播放量\", \"结算播放量占比\", \"运营播放份额\", \"免费用户结算播放占比\", \"完播量\", \"数据日期\"]",
"sql": "SELECT SUM(结算播放量) FROM 艺人库 WHERE 歌手名 = '周倩倩' AND datediff('day', 数据日期, '2023-08-31') <= 7 ",
"generatedSchemaLinkingCoT": "Let’s think step by step. In the question \"周倩倩近7天结算播放量总和是多少 (补充信息:’'199509'‘是一个’MPPM歌手ID‘。当前的日期是2023-08-31) (备注: )\", we are asked:\n\"歌手ID‘。\" so we need column = [歌手名]\n\"当前的日期是\" so we need column = [数据日期]\n\"结算播放量总\" so we need column = [结算播放量]\nBased on the tables, columns, and Foreign_keys, The set of possible cell values are = [周倩倩,7,day]. So the Schema_links are:\nSchema_links: [歌手名,数据日期,结算播放量,周倩倩,7,day]",
"generatedSchemaLinkings": "[歌手名,数据日期,结算播放量,周倩倩,7,day]"
},
{
"question": "超音数访问次数大于1k的部门是哪些",
"questionAugmented": "超音数访问次数大于1k的部门是哪些 (补充信息:。当前的日期是2023-09-14) (备注: )",
"modelName": "超音数产品",
"dbSchema": "Table: 超音数产品, Columns = [\"部门\", \"模块\", \"用户名\", \"访问次数\", \"访问人数\", \"访问时长\", \"数据日期\"]",
"sql": "SELECT 部门 FROM 超音数产品 WHERE 访问次数 > 1000",
"generatedSchemaLinkingCoT": "Let’s think step by step. In the question \"超音数访问次数大于1k的部门是哪些 (补充信息:。当前的日期是2023-09-14) (备注: )\", we are asked:\n\"部门是哪些 \" so we need column = [部门]\n\"访问次数大于\" so we need column = [访问次数]\nBased on the tables, columns, and Foreign_keys, The set of possible cell values are = [1000]. So the Schema_links are:\nSchema_links: [部门,访问次数,1000]",
"generatedSchemaLinkings": "[部门,访问次数,1000]"
},
{
"question": "陈亿训唱的所有的播放量大于20k的孤勇者有哪些",
"questionAugmented": "陈亿训唱的所有的播放量大于20k的孤勇者有哪些 (补充信息:’'199509'‘是一个’MPPM歌手ID‘,’ '1527123'‘是一个’MPPM歌曲ID‘。当前的日期是2023-09-18) (备注: )",
"modelName": "歌曲库",
"dbSchema": "Table: 歌曲库, Columns = [\"歌曲名\", \"MPPM歌手ID\", \"歌曲版本\", \"歌曲类型\", \"MPPM歌曲ID\", \"是否严选窄口径歌曲\", \"是否严选宽口径歌曲\", \"是否潮流人歌曲\", \"超声波歌曲ID\", \"C音歌曲ID\", \"C音歌曲MID\", \"结算播放量\", \"运营播放量\", \"分享量\", \"收藏量\", \"运营搜播量\", \"结算搜播量\", \"拉新用户数\", \"拉活用户数\", \"分享率\", \"结算播放份额\", \"数据日期\"]",
"sql": "SELECT 歌曲名 FROM 歌曲库 WHERE 结算播放量 > 20000 AND 歌手名 = '陈亿训' AND 歌曲名 = '孤勇者'",
"generatedSchemaLinkingCoT": "Let’s think step by step. In the question \"陈亿训唱的所有的播放量大于20k的孤勇者有哪些 (补充信息:’'199509'‘是一个’MPPM歌手ID‘,’ '1527123'‘是一个’MPPM歌曲ID‘。当前的日期是2023-09-18) (备注: )\", we are asked:\n\"歌曲ID‘。\" so we need column = [歌曲名]\n\"的所有的播放量\" so we need column = [结算播放量]\n\"歌手ID‘,\" so we need column = [歌手名]\nBased on the tables, columns, and Foreign_keys, The set of possible cell values are = [孤勇者,20000,陈亿训]. So the Schema_links are:\nSchema_links: [歌曲名,结算播放量,歌手名,孤勇者,20000,陈亿训]",
"generatedSchemaLinkings": "[歌曲名,结算播放量,歌手名,孤勇者,20000,陈亿训]"
},
{
"question": "周洁轮去年发布的歌曲有哪些",
"questionAugmented": "周洁轮去年发布的歌曲有哪些 (补充信息:’'23109'‘是一个’MPPM歌手ID‘。当前的日期是2023-09-18) (备注: )",
"modelName": "歌曲库",
"dbSchema": "Table: 歌曲库, Columns = [\"歌曲名\", \"歌曲版本\", \"歌手名\", \"歌曲类型\", \"发布时间\", \"MPPM歌曲ID\", \"是否严选窄口径歌曲\", \"是否严选宽口径歌曲\", \"是否潮流人歌曲\", \"超声波歌曲ID\", \"C音歌曲ID\", \"C音歌曲MID\", \"结算播放量\", \"运营播放量\", \"分享量\", \"收藏量\", \"运营搜播量\", \"结算搜播量\", \"拉新用户数\", \"拉活用户数\", \"分享率\", \"结算播放份额\", \"数据日期\"]",
"sql": "SELECT 歌曲名 FROM 歌曲库 WHERE datediff('year', 发布时间, '2023-09-18') <= 1 AND 歌手名 = '周洁轮'",
"generatedSchemaLinkingCoT": "Let’s think step by step. In the question \"周洁轮去年发布的歌曲有哪些 (补充信息:’'23109'‘是一个’MPPM歌手ID‘。当前的日期是2023-09-18) (备注: )\", we are asked:\n\"歌曲有哪些 \" so we need column = [歌曲名]\n\"歌手ID‘。\" so we need column = [歌手名]\n\"发布的歌曲有\" so we need column = [发布时间]\nBased on the tables, columns, and Foreign_keys, The set of possible cell values are = [1,周洁轮,year]. So the Schema_links are:\nSchema_links: [歌曲名,歌手名,发布时间,1,周洁轮,year]",
"generatedSchemaLinkings": "[歌曲名,歌手名,发布时间,1,周洁轮,year]"
},
{
"question": "我想要近半年签约的播放量前十的歌手有哪些",
"questionAugmented": "我想要近半年签约的播放量前十的歌手有哪些 (补充信息:。当前的日期是2023-09-11) (备注: )",
"modelName": "艺人库",
"dbSchema": "Table: 艺人库, Columns = [\"播放量层级\", \"播放量单调性\", \"播放量方差\", \"播放量突增类型\", \"播放量集中度\", \"歌手名\", \"歌手等级\", \"歌手类型\", \"歌手来源\", \"签约日期\", \"MPPM潮流人等级\", \"结算播放量\", \"运营播放量\", \"历史累计结算播放量\", \"有播放量歌曲数\", \"历史累计运营播放量\", \"付费用户结算播放量\", \"结算播放量占比\", \"运营播放份额\", \"免费用户结算播放占比\", \"完播量\", \"数据日期\"]",
"sql": "SELECT 歌手名 FROM 艺人库 WHERE datediff('year', 签约日期, '2023-09-11') <= 0.5 ORDER BY 结算播放量 DESC LIMIT 10",
"generatedSchemaLinkingCoT": "Let’s think step by step. In the question \"我想要近半年签约的播放量前十的歌手有哪些 (补充信息:。当前的日期是2023-09-11) (备注: )\", we are asked:\n\"签约的播放量\" so we need column = [签约日期]\n\"歌手有哪些 \" so we need column = [歌手名]\n\"签约的播放量\" so we need column = [结算播放量]\nBased on the tables, columns, and Foreign_keys, The set of possible cell values are = [10,0.5,year]. So the Schema_links are:\nSchema_links: [签约日期,歌手名,结算播放量,10,0.5,year]",
"generatedSchemaLinkings": "[签约日期,歌手名,结算播放量,10,0.5,year]"
},
{
"question": "最近一年发行的歌曲中,有哪些在近7天播放超过一千万的",
"questionAugmented": "最近一年发行的歌曲中,有哪些在近7天播放超过一千万的 (补充信息:。当前的日期是2023-08-12) (备注: )",
"modelName": "歌曲库",
"dbSchema": "Table: 歌曲库, Columns = [\"发行日期\", \"歌曲语言\", \"歌曲来源\", \"歌曲流派\", \"歌曲名\", \"歌曲版本\", \"歌曲类型\", \"发行时间\", \"数据日期\"]",
"sql": "SELECT 歌曲名 FROM 歌曲库 WHERE datediff('year', 发行日期, '2023-08-12') <= 1 AND datediff('day', 数据日期, '2023-08-12') <= 7 AND 结算播放量 > 10000000",
"generatedSchemaLinkingCoT": "Let’s think step by step. In the question \"最近一年发行的歌曲中,有哪些在近7天播放超过一千万的 (补充信息:。当前的日期是2023-08-12) (备注: )\", we are asked:\n\"的歌曲中,有\" so we need column = [歌曲名]\n\"当前的日期是\" so we need column = [数据日期]\n\"天播放超过一\" so we need column = [结算播放量]\n\"最近一年发行\" so we need column = [发行日期]\nBased on the tables, columns, and Foreign_keys, The set of possible cell values are = [10000000,1,7,year,day]. So the Schema_links are:\nSchema_links: [歌曲名,数据日期,结算播放量,发行日期,10000000,1,7,year,day]",
"generatedSchemaLinkings": "[歌曲名,数据日期,结算播放量,发行日期,10000000,1,7,year,day]"
},
{
"question": "今年以来发行的歌曲中,有哪些在近7天播放超过一千万的",
"questionAugmented": "今年以来发行的歌曲中,有哪些在近7天播放超过一千万的 (补充信息:。当前的日期是2023-08-12) (备注: )",
"modelName": "歌曲库",
"dbSchema": "Table: 歌曲库, Columns = [\"发行日期\", \"歌曲语言\", \"歌曲来源\", \"歌曲流派\", \"歌曲名\", \"歌曲版本\", \"歌曲类型\", \"发行时间\", \"数据日期\"]",
"sql": "SELECT 歌曲名 FROM 歌曲库 WHERE datediff('year', 发行日期, '2023-08-12') <= 0 AND datediff('day', 数据日期, '2023-08-12') <= 7 AND 结算播放量 > 10000000",
"generatedSchemaLinkingCoT": "Let’s think step by step. In the question \"今年以来发行的歌曲中,有哪些在近7天播放超过一千万的 (补充信息:。当前的日期是2023-08-12) (备注: )\", we are asked:\n\"的歌曲中,有\" so we need column = [歌曲名]\n\"当前的日期是\" so we need column = [数据日期]\n\"天播放超过一\" so we need column = [结算播放量]\n\"年以来发行的\" so we need column = [发行日期]\nBased on the tables, columns, and Foreign_keys, The set of possible cell values are = [10000000,0,7,year,day]. So the Schema_links are:\nSchema_links: [歌曲名,数据日期,结算播放量,发行日期,10000000,0,7,year,day]",
"generatedSchemaLinkings": "[歌曲名,数据日期,结算播放量,发行日期,10000000,0,7,year,day]"
},
{
"question": "2023年以来发行的歌曲中,有哪些在近7天播放超过一千万的",
"questionAugmented": "2023年以来发行的歌曲中,有哪些在近7天播放超过一千万的 (补充信息:’'514129144'‘是一个’MPPM歌曲ID‘。当前的日期是2023-08-12) (备注: )",
"modelName": "歌曲库",
"dbSchema": "Table: 歌曲库, Columns = [\"发行日期\", \"歌曲语言\", \"歌曲来源\", \"歌曲流派\", \"歌曲名\", \"歌曲版本\", \"歌曲类型\", \"发行时间\", \"数据日期\"]",
"sql": "SELECT 歌曲名 FROM 歌曲库 WHERE 发行日期 >= '2023-01-01' AND datediff('day', 数据日期, '2023-08-12') <= 7 AND 结算播放量 > 10000000",
"generatedSchemaLinkingCoT": "Let’s think step by step. In the question \"2023年以来发行的歌曲中,有哪些在近7天播放超过一千万的 (补充信息:’'514129144'‘是一个’MPPM歌曲ID‘。当前的日期是2023-08-12) (备注: )\", we are asked:\n\"的歌曲中,有\" so we need column = [歌曲名]\n\"当前的日期是\" so we need column = [数据日期]\n\"天播放超过一\" so we need column = [结算播放量]\n\"年以来发行的\" so we need column = [发行日期]\nBased on the tables, columns, and Foreign_keys, The set of possible cell values are = [10000000,2023-01-01,7,day]. So the Schema_links are:\nSchema_links: [歌曲名,数据日期,结算播放量,发行日期,10000000,2023-01-01,7,day]",
"generatedSchemaLinkings": "[歌曲名,数据日期,结算播放量,发行日期,10000000,2023-01-01,7,day]"
},
{
"question": "周洁轮2023年6月之后发布的歌曲有哪些",
"questionAugmented": "周洁轮2023年6月之后发布的歌曲有哪些 (补充信息:’'23109'‘是一个’MPPM歌手ID‘。当前的日期是2023-08-01) (备注: )",
"modelName": "歌曲库",
"dbSchema": "Table: 歌曲库, Columns = [\"歌曲名\", \"歌曲版本\", \"歌手名\", \"歌曲类型\", \"发布时间\", \"MPPM歌曲ID\", \"是否严选窄口径歌曲\", \"是否严选宽口径歌曲\", \"是否潮流人歌曲\", \"超声波歌曲ID\", \"C音歌曲ID\", \"C音歌曲MID\", \"结算播放量\", \"运营播放量\", \"分享量\", \"收藏量\", \"运营搜播量\", \"结算搜播量\", \"拉新用户数\", \"拉活用户数\", \"分享率\", \"结算播放份额\", \"数据日期\"]",
"sql": "SELECT 歌曲名 FROM 歌曲库 WHERE 发布时间 >= '2023-06-01' AND 歌手名 = '周洁轮'",
"generatedSchemaLinkingCoT": "Let’s think step by step. In the question \"周洁轮2023年6月之后发布的歌曲有哪些 (补充信息:’'23109'‘是一个’MPPM歌手ID‘。当前的日期是2023-08-01) (备注: )\", we are asked:\n\"歌曲有哪些 \" so we need column = [歌曲名]\n\"歌手ID‘。\" so we need column = [歌手名]\n\"月之后发布的\" so we need column = [发布时间]\nBased on the tables, columns, and Foreign_keys, The set of possible cell values are = [2023-06-01,周洁轮]. So the Schema_links are:\nSchema_links: [歌曲名,歌手名,发布时间,2023-06-01,周洁轮]",
"generatedSchemaLinkings": "[歌曲名,歌手名,发布时间,2023-06-01,周洁轮]"
},
{
"question": "邓梓琦在2023年1月5日之后发布的歌曲中,有哪些播放量大于500W的?",
"questionAugmented": "邓梓琦在2023年1月5日之后发布的歌曲中,有哪些播放量大于500W的? (补充信息:’'2312311'‘是一个’MPPM歌手ID‘。当前的日期是2023-08-01) (备注: )",
"modelName": "歌曲库",
"dbSchema": "Table: 歌曲库, Columns = [\"歌曲名\", \"歌曲版本\", \"歌手名\", \"歌曲类型\", \"发布时间\", \"MPPM歌曲ID\", \"是否严选窄口径歌曲\", \"是否严选宽口径歌曲\", \"是否潮流人歌曲\", \"超声波歌曲ID\", \"C音歌曲ID\", \"C音歌曲MID\", \"结算播放量\", \"运营播放量\", \"分享量\", \"收藏量\", \"运营搜播量\", \"结算搜播量\", \"拉新用户数\", \"拉活用户数\", \"分享率\", \"结算播放份额\", \"数据日期\"]",
"sql": "SELECT 歌曲名 FROM 歌曲库 WHERE 发布时间 >= '2023-01-05' AND 歌手名 = '邓梓琦' AND 结算播放量 > 5000000",
"generatedSchemaLinkingCoT": "Let’s think step by step. In the question \"邓梓琦在2023年1月5日之后发布的歌曲中,有哪些播放量大于500W的? (补充信息:’'2312311'‘是一个’MPPM歌手ID‘。当前的日期是2023-08-01) (备注: )\", we are asked:\n\"的歌曲中,有\" so we need column = [歌曲名]\n\"中,有哪些播放量\" so we need column = [结算播放量]\n\"歌手ID‘。\" so we need column = [歌手名]\n\"日之后发布的\" so we need column = [发布时间]\nBased on the tables, columns, and Foreign_keys, The set of possible cell values are = [5000000,2023-01-05,邓梓琦]. So the Schema_links are:\nSchema_links: [歌曲名,结算播放量,歌手名,发布时间,5000000,2023-01-05,邓梓琦]",
"generatedSchemaLinkings": "[歌曲名,结算播放量,歌手名,发布时间,5000000,2023-01-05,邓梓琦]"
},
{
"question": "2023年6月以后,张亮英播放量大于200万的歌曲有哪些?",
"questionAugmented": "2023年6月以后,张亮英播放量大于200万的歌曲有哪些? (补充信息:’'45453'‘是一个’MPPM歌手ID‘。当前的日期是2023-09-17) (备注: )",
"modelName": "歌曲库",
"dbSchema": "Table: 歌曲库, Columns = [\"歌曲名\", \"歌曲版本\", \"歌手名\", \"歌曲类型\", \"发布时间\", \"MPPM歌曲ID\", \"是否严选窄口径歌曲\", \"是否严选宽口径歌曲\", \"是否潮流人歌曲\", \"超声波歌曲ID\", \"C音歌曲ID\", \"C音歌曲MID\", \"结算播放量\", \"运营播放量\", \"分享量\", \"收藏量\", \"运营搜播量\", \"结算搜播量\", \"拉新用户数\", \"拉活用户数\", \"分享率\", \"结算播放份额\", \"数据日期\"]",
"sql": "SELECT 歌曲名 FROM 歌曲库 WHERE 数据日期 >= '2023-06-01' AND 歌手名 = '张亮英' AND 结算播放量 > 2000000",
"generatedSchemaLinkingCoT": "Let’s think step by step. In the question \"2023年6月以后,张亮英播放量大于200万的歌曲有哪些? (补充信息:’'45453'‘是一个’MPPM歌手ID‘。当前的日期是2023-09-17) (备注: )\", we are asked:\n\"的歌曲有哪些? (\" so we need column = [歌曲名]\n\"当前的日期是\" so we need column = [数据日期]\n\"后,张亮英播放量大\" so we need column = [结算播放量]\n\"歌手ID‘。\" so we need column = [歌手名]\nBased on the tables, columns, and Foreign_keys, The set of possible cell values are = [2000000,2023-06-01,张亮英]. So the Schema_links are:\nSchema_links: [歌曲名,数据日期,结算播放量,歌手名,2000000,2023-06-01,张亮英]",
"generatedSchemaLinkings": "[歌曲名,数据日期,结算播放量,歌手名,2000000,2023-06-01,张亮英]"
},
{
"question": "2021年6月以后发布的李雨纯的播放量大于20万的歌曲有哪些",
"questionAugmented": "2021年6月以后发布的李雨纯的播放量大于20万的歌曲有哪些 (补充信息:’'23109'‘是一个’MPPM歌手ID‘。当前的日期是2023-08-16) (备注: )",
"modelName": "歌曲库",
"dbSchema": "Table: 歌曲库, Columns = [\"歌曲名\", \"歌曲版本\", \"歌手名\", \"歌曲类型\", \"发布时间\", \"MPPM歌曲ID\", \"是否严选窄口径歌曲\", \"是否严选宽口径歌曲\", \"是否潮流人歌曲\", \"超声波歌曲ID\", \"C音歌曲ID\", \"C音歌曲MID\", \"结算播放量\", \"运营播放量\", \"分享量\", \"收藏量\", \"运营搜播量\", \"结算搜播量\", \"拉新用户数\", \"拉活用户数\", \"分享率\", \"结算播放份额\", \"数据日期\"]",
"sql": "SELECT 歌曲名 FROM 歌曲库 WHERE 发布时间 >= '2021-06-01' AND 歌手名 = '李雨纯' AND 结算播放量 > 200000",
"generatedSchemaLinkingCoT": "Let’s think step by step. In the question \"2021年6月以后发布的李雨纯的播放量大于20万的歌曲有哪些 (补充信息:’'23109'‘是一个’MPPM歌手ID‘。当前的日期是2023-08-16) (备注: )\", we are asked:\n\"歌曲有哪些 \" so we need column = [歌曲名]\n\"的播放量大于\" so we need column = [结算播放量]\n\"歌手ID‘。\" so we need column = [歌手名]\n\"月以后发布的\" so we need column = [发布时间]\nBased on the tables, columns, and Foreign_keys, The set of possible cell values are = [200000,2021-06-01,李雨纯]. So the Schema_links are:\nSchema_links: [歌曲名,结算播放量,歌手名,发布时间,200000,2021-06-01,李雨纯]",
"generatedSchemaLinkings": "[歌曲名,结算播放量,歌手名,发布时间,200000,2021-06-01,李雨纯]"
},
{
"question": "刘锝桦在1992年4月2日到2020年5月2日之间发布的播放量大于20万的歌曲有哪些",
"questionAugmented": "刘锝桦在1992年4月2日到2020年5月2日之间发布的播放量大于20万的歌曲有哪些 (补充信息:’'4234234'‘是一个’MPPM歌手ID‘。当前的日期是2023-08-16) (备注: )",
"modelName": "歌曲库",
"dbSchema": "Table: 歌曲库, Columns = [\"歌曲名\", \"歌曲版本\", \"歌手名\", \"歌曲类型\", \"发布时间\", \"MPPM歌曲ID\", \"是否严选窄口径歌曲\", \"是否严选宽口径歌曲\", \"是否潮流人歌曲\", \"超声波歌曲ID\", \"C音歌曲ID\", \"C音歌曲MID\", \"结算播放量\", \"运营播放量\", \"分享量\", \"收藏量\", \"运营搜播量\", \"结算搜播量\", \"拉新用户数\", \"拉活用户数\", \"分享率\", \"结算播放份额\", \"数据日期\"]",
"sql": "SELECT 歌曲名 FROM 歌曲库 WHERE 发布时间 >= '1992-04-02' AND 发布时间 <= '2020-05-02' AND 歌手名 = '刘锝桦' AND 结算播放量 > 200000",
"generatedSchemaLinkingCoT": "Let’s think step by step. In the question \"刘锝桦在1992年4月2日到2020年5月2日之间发布的播放量大于20万的歌曲有哪些 (补充信息:’'4234234'‘是一个’MPPM歌手ID‘。当前的日期是2023-08-16) (备注: )\", we are asked:\n\"歌曲有哪些 \" so we need column = [歌曲名]\n\"发布的播放量\" so we need column = [结算播放量]\n\"歌手ID‘。\" so we need column = [歌手名]\n\"日之间发布的\" so we need column = [发布时间]\nBased on the tables, columns, and Foreign_keys, The set of possible cell values are = [200000,刘锝桦,1992-04-02,2020-05-02]. So the Schema_links are:\nSchema_links: [歌曲名,结算播放量,歌手名,发布时间,200000,刘锝桦,1992-04-02,2020-05-02]",
"generatedSchemaLinkings": "[歌曲名,结算播放量,歌手名,发布时间,200000,刘锝桦,1992-04-02,2020-05-02]"
},
{
"question": "超音数近30天访问次数的平均数",
"questionAugmented": "超音数近30天访问次数的平均数 (补充信息:。当前的日期是2023-09-04) (备注: )",
"modelName": "超音数产品",
"dbSchema": "Table: 超音数产品, Columns = [\"用户名\", \"部门\", \"模块\", \"访问时长\", \"访问次数\", \"访问人数\", \"数据日期\"]",
"sql": "SELECT AVG(访问次数) FROM 超音数产品 WHERE datediff('day', 数据日期, '2023-09-04') <= 30 ",
"generatedSchemaLinkingCoT": "Let’s think step by step. In the question \"超音数近30天访问次数的平均数 (补充信息:。当前的日期是2023-09-04) (备注: )\", we are asked:\n\"当前的日期是\" so we need column = [数据日期]\n\"访问次数的平均数\" so we need column = [访问次数]\nBased on the tables, columns, and Foreign_keys, The set of possible cell values are = [30,day]. So the Schema_links are:\nSchema_links: [数据日期,访问次数,30,day]",
"generatedSchemaLinkings": "[数据日期,访问次数,30,day]"
},
{
"question": "超音数近半年哪个月的访问次数汇总最高",
"questionAugmented": "超音数近半年哪个月的访问次数汇总最高 (补充信息:。当前的日期是2023-09-04) (备注: )",
"modelName": "超音数产品",
"dbSchema": "Table: 超音数产品, Columns = [\"用户名\", \"部门\", \"模块\", \"访问时长\", \"访问次数\", \"访问人数\", \"数据日期\"]",
"sql": "SELECT MONTH(数据日期), SUM(访问次数) FROM 超音数产品 WHERE datediff('year', 数据日期, '2023-09-04') <= 0.5 GROUP BY MONTH(数据日期) ORDER BY SUM(访问次数) DESC LIMIT 1",
"generatedSchemaLinkingCoT": "Let’s think step by step. In the question \"超音数近半年哪个月的访问次数汇总最高 (补充信息:。当前的日期是2023-09-04) (备注: )\", we are asked:\n\"当前的日期是\" so we need column = [数据日期]\n\"的访问次数汇总\" so we need column = [访问次数]\nBased on the tables, columns, and Foreign_keys, The set of possible cell values are = [1,0.5,year]. So the Schema_links are:\nSchema_links: [数据日期,访问次数,1,0.5,year]",
"generatedSchemaLinkings": "[数据日期,访问次数,1,0.5,year]"
},
{
"question": "超音数近半年每个月的平均访问次数",
"questionAugmented": "超音数近半年每个月的平均访问次数 (补充信息:。当前的日期是2023-09-04) (备注: )",
"modelName": "超音数产品",
"dbSchema": "Table: 超音数产品, Columns = [\"用户名\", \"部门\", \"模块\", \"访问时长\", \"访问次数\", \"访问人数\", \"数据日期\"]",
"sql": "SELECT MONTH(数据日期), AVG(访问次数) FROM 超音数产品 WHERE datediff('year', 数据日期, '2023-09-04') <= 0.5 GROUP BY MONTH(数据日期)",
"generatedSchemaLinkingCoT": "Let’s think step by step. In the question \"超音数近半年每个月的平均访问次数 (补充信息:。当前的日期是2023-09-04) (备注: )\", we are asked:\n\"当前的日期是\" so we need column = [数据日期]\n\"访问次数 (\" so we need column = [访问次数]\nBased on the tables, columns, and Foreign_keys, The set of possible cell values are = [0.5,year]. So the Schema_links are:\nSchema_links: [数据日期,访问次数,0.5,year]",
"generatedSchemaLinkings": "[数据日期,访问次数,0.5,year]"
},
{
"question": "超音数 按部门统计访问次数 top10 的部门",
"questionAugmented": "超音数 按部门统计访问次数 top10 的部门 (补充信息:。当前的日期是2023-09-10) (备注: )",
"modelName": "超音数产品",
"dbSchema": "Table: 超音数产品, Columns = [\"用户名\", \"部门\", \"模块\", \"访问时长\", \"访问次数\", \"访问人数\", \"数据日期\"]",
"sql": "SELECT 部门, SUM(访问次数) FROM 超音数产品 GROUP BY 部门 ORDER BY SUM(访问次数) DESC LIMIT 10",
"generatedSchemaLinkingCoT": "Let’s think step by step. In the question \"超音数 按部门统计访问次数 top10 的部门 (补充信息:。当前的日期是2023-09-10) (备注: )\", we are asked:\n\" 的部门 (\" so we need column = [部门]\n\"计访问次数 \" so we need column = [访问次数]\nBased on the tables, columns, and Foreign_keys, The set of possible cell values are = [10]. So the Schema_links are:\nSchema_links: [部门,访问次数,10]",
"generatedSchemaLinkings": "[部门,访问次数,10]"
},
{
"question": "超音速 近7个月,月度总访问量超过 2万的月份",
"questionAugmented": "超音速 近7个月,月度总访问量超过 2万的月份 (补充信息:。当前的日期是2023-09-10) (备注: )",
"modelName": "超音数产品",
"dbSchema": "Table: 超音数产品, Columns = [\"用户名\", \"部门\", \"模块\", \"访问时长\", \"访问次数\", \"访问人数\", \"数据日期\"]",
"sql": "SELECT MONTH(数据日期) FROM 超音数产品 WHERE datediff('month', 数据日期, '2023-09-10') <= 7 GROUP BY MONTH(数据日期) HAVING SUM(访问次数) > 20000",
"generatedSchemaLinkingCoT": "Let’s think step by step. In the question \"超音速 近7个月,月度总访问量超过 2万的月份 (补充信息:。当前的日期是2023-09-10) (备注: )\", we are asked:\n\"当前的日期是\" so we need column = [数据日期]\n\"访问量超过 \" so we need column = [访问次数]\nBased on the tables, columns, and Foreign_keys, The set of possible cell values are = [7,20000,month]. So the Schema_links are:\nSchema_links: [数据日期,访问次数,7,20000,month]",
"generatedSchemaLinkings": "[数据日期,访问次数,7,20000,month]"
},
{
"question": "2022年7月到2023年7月之间发布到歌曲,按播放量取top 100,再按月粒度来统计近1年的运营播放量",
"questionAugmented": "2022年7月到2023年7月之间发布到歌曲,按播放量取top 100,再按月粒度来统计近1年的运营播放量 (补充信息:。当前的日期是2023-09-10) (备注: )",
"modelName": "歌曲库",
"dbSchema": "Table: 歌曲库, Columns = [\"歌曲语言\", \"歌曲来源\", \"运营播放量\", \"播放量\", \"歌曲名\", \"结算播放量\", \"专辑名\", \"发布日期\", \"歌曲版本\", \"歌曲类型\", \"数据日期\"]",
"sql": "SELECT MONTH(数据日期), SUM(运营播放量) FROM (SELECT 数据日期, 运营播放量 FROM 歌曲库 WHERE 发布日期 >= '2022-07-01' AND 发布日期 <= '2023-07-01' ORDER BY 播放量 DESC LIMIT 100) t WHERE datediff('year', 数据日期, '2023-09-10') <= 1 GROUP BY MONTH(数据日期)",
"generatedSchemaLinkingCoT": "Let’s think step by step. In the question \"2022年7月到2023年7月之间发布到歌曲,按播放量取top 100,再按月粒度来统计近1年的运营播放量 (补充信息:。当前的日期是2023-09-10) (备注: )\", we are asked:\n\"运营播放量 \" so we need column = [播放量]\n\"当前的日期是\" so we need column = [数据日期]\n\"月之间发布到\" so we need column = [发布日期]\n\"运营播放量 \" so we need column = [运营播放量]\nBased on the tables, columns, and Foreign_keys, The set of possible cell values are = [1,year,100,2022-07-01,2023-07-01]. So the Schema_links are:\nSchema_links: [播放量,数据日期,发布日期,运营播放量,1,year,100,2022-07-01,2023-07-01]",
"generatedSchemaLinkings": "[播放量,数据日期,发布日期,运营播放量,1,year,100,2022-07-01,2023-07-01]"
},
{
"question": "2022年7月到2023年7月之间发布到歌曲,按播放量取top100,再按月粒度来统计近1年的运营播放量之和,筛选出其中运营播放量之和大于2k的月份",
"questionAugmented": "2022年7月到2023年7月之间发布到歌曲,按播放量取top100,再按月粒度来统计近1年的运营播放量之和,筛选出其中运营播放量之和大于2k的月份 (补充信息:。当前的日期是2023-09-10) (备注: )",
"modelName": "歌曲库",
"dbSchema": "Table: 歌曲库, Columns = [\"歌曲语言\", \"歌曲来源\", \"运营播放量\", \"播放量\", \"歌曲名\", \"结算播放量\", \"专辑名\", \"发布日期\", \"歌曲版本\", \"歌曲类型\", \"数据日期\"]",
"sql": "SELECT MONTH(数据日期), SUM(运营播放量) FROM (SELECT 数据日期, 运营播放量 FROM 歌曲库 WHERE 发布日期 >= '2022-07-01' AND 发布日期 <= '2023-07-01' ORDER BY 播放量 DESC LIMIT 100) t WHERE datediff('year', 数据日期, '2023-09-10') <= 1 GROUP BY MONTH(数据日期) HAVING SUM(运营播放量) > 2000",
"generatedSchemaLinkingCoT": "Let’s think step by step. In the question \"2022年7月到2023年7月之间发布到歌曲,按播放量取top100,再按月粒度来统计近1年的运营播放量之和,筛选出其中运营播放量之和大于2k的月份 (补充信息:。当前的日期是2023-09-10) (备注: )\", we are asked:\n\"播放量之和,\" so we need column = [播放量]\n\"当前的日期是\" so we need column = [数据日期]\n\"月之间发布到\" so we need column = [发布日期]\n\"运营播放量之\" so we need column = [运营播放量]\nBased on the tables, columns, and Foreign_keys, The set of possible cell values are = [1,2000,year,100,2022-07-01,2023-07-01]. So the Schema_links are:\nSchema_links: [播放量,数据日期,发布日期,运营播放量,1,2000,year,100,2022-07-01,2023-07-01]",
"generatedSchemaLinkings": "[播放量,数据日期,发布日期,运营播放量,1,2000,year,100,2022-07-01,2023-07-01]"
},
{
"question": "今年智能机在哪个国家的销量之和最高",
"questionAugmented": "今年智能机在哪个国家的销量之和最高 (补充信息:’'智能机'‘是一个’机型类别‘。当前的日期是2023-11-01) (备注: )",
"modelName": "营销月模型",
"dbSchema": "Table: 营销月模型, Columns = [\"国家中文名\", \"机型类别\", \"销量\", \"数据日期\"]",
"sql": "SELECT 国家中文名, SUM(销量) FROM 营销月模型 WHERE 机型类别 = '智能机' AND 数据日期 >= '2023-01-01' AND 数据日期 <= '2023-11-01' GROUP BY 国家中文名 ORDER BY SUM(销量) DESC LIMIT 1",
"generatedSchemaLinkingCoT": "Let’s think step by step. In the question \"今年智能机在哪个国家的销量之和最高 (补充信息:’'智能机'‘是一个’机型类别‘。当前的日期是2023-11-01) (备注: )\", we are asked:\n\"’机型类别‘\" so we need column = [机型类别]\n\"当前的日期是\" so we need column = [数据日期]\n\"国家的销量之和\" so we need column = [国家中文名]\n\"个国家的销量\" so we need column = [销量]\nBased on the tables, columns, and Foreign_keys, The set of possible cell values are = [1,2023-11-01,智能机,2023-01-01]. So the Schema_links are:\nSchema_links: [机型类别,数据日期,国家中文名,销量,1,2023-11-01,智能机,2023-01-01]",
"generatedSchemaLinkings": "[机型类别,数据日期,国家中文名,销量,1,2023-11-01,智能机,2023-01-01]"
}
]
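The generatedSchemaLinkingCoT and generatedSchemaLinkings fields in these records line up with the output format of auto_cot_run in the removed auto_cot module that appears later in this diff. A minimal sketch of regenerating the two fields for one record (the window sizes are illustrative values, not the ones used to build this file):

# Illustrative only: regenerate the CoT fields for one record with the
# removed auto_cot module; window sizes here are example values.
from auto_cot import auto_cot_run

question = "比较jackjchen和robinlee在超音数的访问次数"
sql = "SELECT 用户名, 访问次数 FROM 超音数产品 WHERE 用户名 IN ('jackjchen', 'robinlee')"
cot, schema_linkings = auto_cot_run(question, sql, min_window_size=6, max_window_size=10)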
@@ -1,21 +0,0 @@
# -*- coding:utf-8 -*-
from typing import Any, List, Mapping, Optional, Union

import chromadb
from chromadb.api import Collection
from chromadb.config import Settings

import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from config.config_parse import CHROMA_DB_PERSIST_PATH


client = chromadb.Client(
    Settings(
        chroma_db_impl="duckdb+parquet",
        persist_directory=CHROMA_DB_PERSIST_PATH,  # Optional, defaults to .chromadb/ in the current directory
    )
)
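For reference, a minimal sketch of how this module-level client was consumed (the collection name below is an illustrative placeholder, not a name from the repository):

# Illustrative usage of the module-level chromadb client; "example_collection"
# is a placeholder name.
from instances.chromadb_instance import client

collection = client.get_or_create_collection(name="example_collection")
print(collection.count())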
@@ -1,26 +0,0 @@
# -*- coding:utf-8 -*-
from langchain import llms

import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from config.config_parse import LLM_PROVIDER_NAME, llm_config_dict


def get_llm(llm_config: dict):
    if LLM_PROVIDER_NAME in llms.type_to_cls_dict:
        llm_provider = llms.type_to_cls_dict[LLM_PROVIDER_NAME]
        if llm_config is None or llm_config["baseUrl"] is None or llm_config["baseUrl"] == '':
            llm = llm_provider(**llm_config_dict)
        else:
            openai_llm_config = {}
            openai_llm_config["model_name"] = llm_config["modelName"]
            openai_llm_config["openai_api_base"] = llm_config["baseUrl"]
            openai_llm_config["openai_api_key"] = llm_config["apiKey"]
            openai_llm_config["temperature"] = llm_config["temperature"]
            llm = llm_provider(**openai_llm_config)
        return llm
    else:
        raise Exception("llm_provider_name is not supported: {}".format(LLM_PROVIDER_NAME))
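A minimal sketch of calling get_llm with a per-request config (every value below is a placeholder; only the key names come from the code above):

# Illustrative call; all config values are placeholders.
llm_config = {
    "modelName": "gpt-3.5-turbo",
    "baseUrl": "https://api.example.com/v1",
    "apiKey": "sk-...",
    "temperature": 0.0,
}
llm = get_llm(llm_config)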
@@ -1,14 +0,0 @@
# -*- coding:utf-8 -*-

from loguru import logger
import sys
import os

sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from config.config_parse import LOG_FILE_PATH

logger.remove()  # remove the old handler; else, the old one will work along with the new one added below
logger.add(LOG_FILE_PATH, rotation="500 MB", retention="7 days", format="{time:YYYY-MM-DD at HH:mm:ss} | {level} | {message}", level="INFO")
@@ -1,11 +0,0 @@
# -*- coding:utf-8 -*-
import os
import sys

sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from utils.text2vec import Text2VecEmbeddingFunction

emb_func = Text2VecEmbeddingFunction()
@@ -1,32 +0,0 @@
#!/bin/bash
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

set -x


PROFILES="-P "

# python style checks rely on `black` in path
if ! command -v black &> /dev/null
then
  echo "Skip Python lint since 'black' is not available. Please install 'black' by running 'pip install black==22.3.0'"
else
  PROFILES="${PROFILES} spotless-python"
fi

mvn spotless:apply $PROFILES
@@ -1,9 +0,0 @@
langchain==0.0.207
openai==0.27.4
fastapi==0.95.1
chromadb==0.3.26
tiktoken==0.3.3
uvicorn[standard]==0.21.1
pandas==1.5.3
loguru==0.7.2
sqlglot==19.5.1
@@ -1,99 +0,0 @@
# -*- coding:utf-8 -*-
import json
import os
import re
import sys
from typing import Any, List, Mapping, Union

from instances.logging_instance import logger

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(os.path.dirname(os.path.abspath(__file__)))


def construct_plugin_prompt(tool_config):
    tool_name = tool_config["name"]
    tool_description = tool_config["description"]
    tool_examples = tool_config["examples"]

    prompt = "【工具名称】\n" + tool_name + "\n"
    prompt += "【工具描述】\n" + tool_description + "\n"

    prompt += "【工具适用问题示例】\n"
    for example in tool_examples:
        prompt += example + "\n"
    return prompt


def construct_plugin_pool_prompt(tool_config_list):
    tool_explain_list = []
    for tool_config in tool_config_list:
        tool_explain = construct_plugin_prompt(tool_config)
        tool_explain_list.append(tool_explain)

    tool_explain_list_str = "\n\n".join(tool_explain_list)

    return tool_explain_list_str


def construct_task_prompt(query_text, tool_explain_list_str):
    instruction = """问题为:{query_text}\n请根据问题和工具的描述,选择对应的工具,完成任务。请注意,只能选择1个工具。请一步一步地分析选择工具的原因(每个工具的【工具适用问题示例】是选择的重要参考依据),并给出最终选择,输出格式为json,key为’分析过程‘, ’选择工具‘""".format(
        query_text=query_text
    )

    prompt = "工具选择如下:\n\n{tool_explain_list_str}\n\n【任务说明】\n{instruction}".format(
        instruction=instruction, tool_explain_list_str=tool_explain_list_str
    )

    return prompt


def plugin_selection_output_parse(llm_output: str) -> Union[Mapping[str, str], None]:
    try:
        pattern = r"\{[^{}]+\}"
        find_result = re.findall(pattern, llm_output)
        result = find_result[0].strip()

        logger.info("result: {}", result)

        result_dict = json.loads(result)
        logger.info("result_dict: {}", result_dict)

        key_mapping = {"分析过程": "analysis", "选择工具": "toolSelection"}

        converted_result_dict = {
            key_mapping[key]: value
            for key, value in result_dict.items()
            if key in key_mapping
        }

    except Exception as e:
        logger.exception(e)
        converted_result_dict = None

    return converted_result_dict


def plugins_config_format_convert(
    plugin_config_list: List[Mapping[str, Any]]
) -> List[Mapping[str, Any]]:
    plugin_config_list_new = []
    for plugin_config in plugin_config_list:
        plugin_config_new = dict()
        name = plugin_config["name"]
        description = plugin_config["description"]
        examples = plugin_config["examples"]
        parameters = plugin_config["parameters"]

        examples_str = "\n".join(examples)
        description_new = """{plugin_desc}\n\n例如能够处理如下问题:\n{examples_str}""".format(
            plugin_desc=description, examples_str=examples_str
        )

        plugin_config_new["name"] = name
        plugin_config_new["description"] = description_new
        plugin_config_new["parameters"] = parameters

        plugin_config_list_new.append(plugin_config_new)

    return plugin_config_list_new
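A minimal sketch of how these helpers chained together for plugin selection (the tool config values and the llm callable below are placeholders):

# Illustrative chain; tool_configs and the llm callable are placeholders.
tool_configs = [{
    "name": "sql_tool",
    "description": "将自然语言转换为SQL查询",
    "examples": ["近7天播放量最高的歌曲是什么"],
}]
tools_prompt = construct_plugin_pool_prompt(tool_configs)
task_prompt = construct_task_prompt("近7天播放量趋势", tools_prompt)
# parsed = plugin_selection_output_parse(llm(task_prompt))  # llm: any text-in/text-out callable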
@@ -1,26 +0,0 @@
# -*- coding:utf-8 -*-

import os
import sys
from typing import Any, List, Mapping, Union

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from plugin_call.prompt_construct import (
    construct_plugin_pool_prompt,
    construct_task_prompt,
    plugin_selection_output_parse,
)

# def plugin_selection_run(
#     query_text: str, plugin_configs: List[Mapping[str, Any]]
# ) -> Union[Mapping[str, str], None]:

#     tools_prompt = construct_plugin_pool_prompt(plugin_configs)

#     task_prompt = construct_task_prompt(query_text, tools_prompt)
#     llm_output = llm(task_prompt)
#     parsed_output = plugin_selection_output_parse(llm_output)

#     return parsed_output
@@ -1,98 +0,0 @@
# -*- coding:utf-8 -*-

import os
import sys
import uuid
from typing import Any, List, Mapping, Optional, Union

import chromadb
from chromadb import Client
from chromadb.config import Settings
from chromadb.api import Collection, Documents, Embeddings
from chromadb.api.types import CollectionMetadata

sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from instances.logging_instance import logger
from utils.chromadb_utils import (get_chroma_collection_size, query_chroma_collection,
                                  parse_retrieval_chroma_collection_query, chroma_collection_query_retrieval_format,
                                  get_chroma_collection_by_ids,
                                  add_chroma_collection, update_chroma_collection, delete_chroma_collection_by_ids,
                                  empty_chroma_collection_2)

from utils.text2vec import Text2VecEmbeddingFunction

class ChromaCollectionRetriever(object):
    def __init__(self, collection:Collection):
        self.collection = collection

    def retrieval_query_run(self, query_texts_list:List[str]=None, query_embeddings:Embeddings=None,
                            filter_condition:Mapping[str,str]=None, n_results:int=5):

        retrieval_res = query_chroma_collection(self.collection, query_texts_list, query_embeddings,
                                                filter_condition, n_results)

        parsed_retrieval_res = parse_retrieval_chroma_collection_query(retrieval_res)
        logger.debug('parsed_retrieval_res: {}', parsed_retrieval_res)
        parsed_retrieval_res_format = chroma_collection_query_retrieval_format(query_texts_list, query_embeddings, parsed_retrieval_res)
        logger.debug('parsed_retrieval_res_format: {}', parsed_retrieval_res_format)

        return parsed_retrieval_res_format

    def get_query_by_ids(self, query_ids:List[str]):
        queries = get_chroma_collection_by_ids(self.collection, query_ids)
        return queries

    def get_query_size(self):
        return get_chroma_collection_size(self.collection)

    def add_queries(self, query_text_list:List[str],
                    query_id_list:List[str],
                    metadatas:List[Mapping[str, str]]=None,
                    embeddings:Embeddings=None):
        add_chroma_collection(self.collection, query_text_list, query_id_list, metadatas, embeddings)
        return True

    def update_queries(self, query_text_list:List[str],
                       query_id_list:List[str],
                       metadatas:List[Mapping[str, str]]=None,
                       embeddings:Embeddings=None):
        update_chroma_collection(self.collection, query_text_list, query_id_list, metadatas, embeddings)
        return True

    def delete_queries_by_ids(self, query_ids:List[str]):
        delete_chroma_collection_by_ids(self.collection, query_ids)
        return True

    def empty_query_collection(self):
        self.collection = empty_chroma_collection_2(self.collection)

        return True

class CollectionManager(object):
    def __init__(self, chroma_client:Client, embedding_func: Text2VecEmbeddingFunction, collection_meta: Optional[CollectionMetadata] = None):
        self.chroma_client = chroma_client
        self.embedding_func = embedding_func
        self.collection_meta = collection_meta

    def list_collections(self):
        collection_list = self.chroma_client.list_collections()
        return collection_list

    def get_collection(self, collection_name:str):
        collection = self.chroma_client.get_collection(name=collection_name, embedding_function=self.embedding_func)
        return collection

    def create_collection(self, collection_name:str):
        collection = self.chroma_client.create_collection(name=collection_name, embedding_function=self.embedding_func, metadata=self.collection_meta)
        return collection

    def get_or_create_collection(self, collection_name:str):
        collection = self.chroma_client.get_or_create_collection(name=collection_name, embedding_function=self.embedding_func, metadata=self.collection_meta)
        return collection

    def delete_collection(self, collection_name:str):
        self.chroma_client.delete_collection(collection_name)
        return True
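A minimal sketch of the two classes working together (the collection name, ids, and metadata below are placeholders):

# Illustrative usage; "demo_queries" and the sample data are placeholders.
from instances.chromadb_instance import client

manager = CollectionManager(chroma_client=client, embedding_func=Text2VecEmbeddingFunction(),
                            collection_meta={"hnsw:space": "cosine"})
retriever = ChromaCollectionRetriever(manager.get_or_create_collection(collection_name="demo_queries"))
retriever.add_queries(["近7天播放量"], ["q-1"], metadatas=[{"source": "demo"}])
print(retriever.retrieval_query_run(query_texts_list=["最近一周播放量"], n_results=1))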
@@ -1,37 +0,0 @@
# -*- coding:utf-8 -*-

import os
import sys
import uuid
from typing import Any, List, Mapping, Optional, Union

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from instances.logging_instance import logger

import chromadb
from chromadb.config import Settings
from chromadb.api import Collection, Documents, Embeddings

from utils.text2vec import Text2VecEmbeddingFunction
from instances.chromadb_instance import client

from config.config_parse import SOLVED_QUERY_COLLECTION_NAME, PRESET_QUERY_COLLECTION_NAME
from retriever import ChromaCollectionRetriever, CollectionManager


emb_func = Text2VecEmbeddingFunction()

collection_manager = CollectionManager(chroma_client=client, embedding_func=emb_func,
                                       collection_meta={"hnsw:space": "cosine"})

solved_query_collection = collection_manager.get_or_create_collection(collection_name=SOLVED_QUERY_COLLECTION_NAME)
preset_query_collection = collection_manager.get_or_create_collection(collection_name=PRESET_QUERY_COLLECTION_NAME)


solved_query_retriever = ChromaCollectionRetriever(solved_query_collection)
preset_query_retriever = ChromaCollectionRetriever(preset_query_collection)

logger.info("init_solved_query_collection_size: {}".format(solved_query_retriever.get_query_size()))
logger.info("init_preset_query_collection_size: {}".format(preset_query_retriever.get_query_size()))
@@ -1,167 +0,0 @@
# -*- coding:utf-8 -*-
from typing import Any, List, Mapping, Optional, Union, Tuple

import os
import sys

sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from instances.logging_instance import logger
from instances.text2vec_instance import emb_func

from sqlglot import parse_one, exp
import numpy as np

def sql2schema_linking(sql: str):
    sql_ast = parse_one(sql)

    fields_raw = []
    table_alias_map = dict()

    literals = []
    fields = []

    for literal in sql_ast.find_all(exp.Literal):
        literals.append(literal.output_name)

    for column in sql_ast.find_all(exp.Column):
        fields_raw.append({
            'column_table_alias': column.table,
            'column_name': column.name,
        })

    for table in sql_ast.find_all(exp.Table):
        if table.alias not in table_alias_map:
            table_alias_map[table.alias] = table.name

    logger.debug(f'literals: {literals}')
    logger.debug(f'fields_raw: {fields_raw}')
    logger.debug(f'table_alias_map: {table_alias_map}')

    for field in fields_raw:
        column_table_alias = field['column_table_alias']
        column_name = field['column_name']

        if column_table_alias.strip() == '':
            column_table = ''
            fields.append((column_table, column_name))
        elif column_table_alias in table_alias_map:
            column_table = table_alias_map[column_table_alias]
            fields.append((column_table, column_name))
        elif column_table_alias in table_alias_map.values():
            column_table = column_table_alias
            fields.append((column_table, column_name))
        else:
            logger.error(f'column_table_alias: {column_table_alias} not in table_alias_map: {table_alias_map}')
            raise Exception(f'column_table_alias: {column_table_alias} not in table_alias_map: {table_alias_map}')

    return {
        'fields': list(set(fields)),
        'literals': literals
    }


def get_question_slices(question: str, min_window_size: int, max_window_size: int):
    assert min_window_size <= max_window_size
    assert min_window_size > 1
    assert max_window_size < len(question)+1

    question_slices = []
    for i in range(len(question)):
        for j in range(i+1, len(question)+1):
            if j-i >= min_window_size and j-i <= max_window_size:
                question_slices.append(question[i:j])

    return question_slices


def schema_linking_match(fields: List[Tuple[str,str]], question: str, min_window_size: int, max_window_size: int):
    question_slices = get_question_slices(question, min_window_size, max_window_size)
    assert len(question_slices) > 0
    logger.debug('question_slices_len:{}'.format(len(question_slices)))
    logger.debug(f'question_slices: {question_slices}')

    question_slices_embeddings = emb_func(question_slices)
    fields_embeddings = emb_func([field[1] for field in fields])

    fields_embeddings = np.array(fields_embeddings)  # (n_fields, 768)
    question_slices_embeddings = np.array(question_slices_embeddings)  # (n_question_slices, 768)

    question_slices_embeddings_norm = question_slices_embeddings / np.linalg.norm(question_slices_embeddings, axis=1, keepdims=True)  # (n_question_slices, 768)
    question_slices_embeddings_norm_transpose = question_slices_embeddings_norm.T  # (768, n_question_slices)

    if len(fields) > 0:
        fields_embeddings_norm = fields_embeddings / np.linalg.norm(fields_embeddings, axis=1, keepdims=True)  # (n_fields, 768)
        fields_question_slices_similarity = np.matmul(fields_embeddings_norm, question_slices_embeddings_norm_transpose)  # (n_fields, n_question_slices)
        logger.debug('fields_question_slices_similarity_max:{}'.format(np.max(fields_question_slices_similarity, axis=1)))
        fields_question_slices_argmax = np.argmax(fields_question_slices_similarity, axis=1)  # (n_fields, )
        logger.debug('fields_question_slices_argmax:{}'.format(fields_question_slices_argmax))

        fields_question_slices_pair = []
        for i in range(len(fields)):
            if fields[i][0]!="":
                fields_question_slices_pair.append((fields[i][0]+'.'+fields[i][1], question_slices[fields_question_slices_argmax[i]]))
            else:
                fields_question_slices_pair.append((fields[i][1], question_slices[fields_question_slices_argmax[i]]))

        logger.debug(f'fields_question_slices_pair: {fields_question_slices_pair}')
    else:
        fields_question_slices_pair = []

    return fields_question_slices_pair


def construct_schema_linking_cot(question:str, fields_question_slices_pair:List[Tuple[str,str]], literals_list:List[str]):
    cot_intro = """Let’s think step by step. In the question "{question}", we are asked:""".format(question=question)

    schema_linkings_list = []

    fields_cot_template = """"{question_slice}" so we need column = [{field}]"""
    fields_cot_list = []
    for field, question_slice in fields_question_slices_pair:
        fields_cot_list.append(fields_cot_template.format(question_slice=question_slice, field=field))
        schema_linkings_list.append(field)
    fields_cot = '\n'.join(fields_cot_list)

    literals_cot_template = """Based on the tables, columns, and Foreign_keys, The set of possible cell values are = [{literals}]. So the Schema_links are:"""
    literals_cot = literals_cot_template.format(literals=",".join(literals_list))

    schema_linkings_list += literals_list
    schema_linking_str = '[' + ",".join(schema_linkings_list) + ']'
    schema_linkings = 'Schema_links: ' + schema_linking_str

    cot = """{cot_intro}""".format(cot_intro=cot_intro)
    if len(fields_cot_list) > 0:
        cot += '\n' + fields_cot

    cot += '\n' + literals_cot
    cot += '\n' + schema_linkings

    return cot, schema_linking_str


def auto_cot_run(question, sql, min_window_size, max_window_size):
    sql_entity = sql2schema_linking(sql)
    logger.debug(f'sql_entity: {sql_entity}')

    fields = sql_entity['fields']
    literals = sql_entity['literals']

    field_linked_pairs = schema_linking_match(fields, question, min_window_size, max_window_size)
    logger.debug(f'field_linked_pairs: {field_linked_pairs}')

    auto_schema_linking_cot, auto_schema_linkings = construct_schema_linking_cot(question, field_linked_pairs, literals)
    logger.debug(f'auto_schema_linking_cot: {auto_schema_linking_cot}')
    logger.debug(f'auto_schema_linkings: {auto_schema_linkings}')

    return auto_schema_linking_cot, auto_schema_linkings


if __name__ == '__main__':
    question = "没有获得过奖项的高校有哪几所?"
    sql = "select 名称 from 高校 where 词条id not in ( select 高校id from 奖项 )"
    min_window_size = 6
    max_window_size = 10

    generated_schema_linking_cot, generated_schema_linkings = auto_cot_run(question, sql, min_window_size, max_window_size)
|
|
||||||
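
The field-to-slice matching above is plain cosine similarity over L2-normalized embeddings followed by an argmax. A minimal self-contained sketch (toy shapes with random vectors; the 768 dimensionality is assumed from the comments above, and the real code embeds text with a text2vec model):

import numpy as np

fields_emb = np.random.rand(3, 768)   # embeddings for 3 fields
slices_emb = np.random.rand(5, 768)   # embeddings for 5 question slices
fields_norm = fields_emb / np.linalg.norm(fields_emb, axis=1, keepdims=True)
slices_norm = slices_emb / np.linalg.norm(slices_emb, axis=1, keepdims=True)
similarity = fields_norm @ slices_norm.T              # (3, 5) cosine similarities
best_slice_per_field = np.argmax(similarity, axis=1)  # closest slice index per field
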
@@ -1,105 +0,0 @@
# -*- coding:utf-8 -*-

import os
import sys
from typing import Any, Dict, List, Mapping, Optional, Union

sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from instances.logging_instance import logger

from auto_cot import auto_cot_run


def transform_sql_example(question: str, current_date: str, table_name: str, field_list: Union[str, List[str]], prior_linkings: Union[str, Mapping[str, str]], prior_exts: str, sql: str = None, terms_list: Optional[List[Dict]] = []):
    db_schema = f"Table: {table_name}, Columns = {field_list}\nForeign_keys: []"

    prior_linkings_pairs = []
    if isinstance(prior_linkings, str):
        prior_linkings = prior_linkings.strip('[]')
        if prior_linkings.strip() == '':
            prior_linkings = []
        else:
            prior_linkings = prior_linkings.split(',')
        logger.debug(f'prior_linkings: {prior_linkings}')

        for prior_linking in prior_linkings:
            logger.debug(f'prior_linking: {prior_linking}')
            entity_value, entity_type = prior_linking.split('->')
            entity_linking = """’{}‘是一个’{}‘""".format(entity_value, entity_type)
            prior_linkings_pairs.append(entity_linking)
    elif isinstance(prior_linkings, Mapping):
        for entity_value, entity_type in prior_linkings.items():
            entity_linking = """’{}‘是一个’{}‘""".format(entity_value, entity_type)
            prior_linkings_pairs.append(entity_linking)

    prior_linkings_str = ','.join(prior_linkings_pairs)

    current_date_str = """当前的日期是{}""".format(current_date)

    terms_desc = ''

    if len(terms_list) > 0:
        terms_desc += "相关业务术语:"
        for idx, term in enumerate(terms_list):

            if (term['description'] is not None and len(term['description']) > 0) and (term['alias'] is not None and len(term['alias']) > 0):
                terms_desc += f"""{idx+1}.<{term['name']}>是业务术语,它通常是指<{term['description']}>,类似的表达还有{term['alias']};"""
            elif (term['description'] is None or len(term['description']) == 0) and (term['alias'] is not None and len(term['alias']) > 0):
                terms_desc += f"""{idx+1}.<{term['name']}>是业务术语,类似的表达还有{term['alias']};"""
            elif (term['description'] is not None and len(term['description']) > 0) and (term['alias'] is None or len(term['alias']) == 0):
                terms_desc += f"""{idx+1}.<{term['name']}>是业务术语,它通常是指<{term['description']}>;"""
            else:
                terms_desc += f"""{idx+1}.<{term['name']}>是业务术语;"""

    if len(terms_desc) > 0:
        terms_desc = terms_desc[:-1]

    question_augmented = """{question} (补充信息:{prior_linking}。{current_date}。{terms_desc}) (备注: {prior_exts})""".format(question=question, prior_linking=prior_linkings_str, prior_exts=prior_exts, current_date=current_date_str, terms_desc=terms_desc)

    return question_augmented, db_schema, sql


def transform_sql_example_autoCoT_run(examplar_list, min_window_size, max_window_size):
    transformed_sql_examplar_list = []

    for examplar in examplar_list:
        question = examplar['question']
        current_date = examplar['currentDate']
        table_name = examplar['tableName']
        field_list = examplar['fieldsList']
        prior_linkings = examplar['priorSchemaLinks']
        sql = examplar['sql']
        if 'priorExts' not in examplar:
            prior_exts = ''
        else:
            prior_exts = examplar['priorExts']

        question_augmented, db_schema, sql = transform_sql_example(question=question, current_date=current_date, table_name=table_name, field_list=field_list, prior_linkings=prior_linkings, prior_exts=prior_exts, sql=sql)
        logger.debug(f'question_augmented: {question_augmented}')
        logger.debug(f'db_schema: {db_schema}')
        logger.debug(f'sql: {sql}')

        generated_schema_linking_cot, generated_schema_linkings = auto_cot_run(question_augmented, sql, min_window_size, max_window_size)

        transformed_sql_examplar = dict()
        transformed_sql_examplar['question'] = question
        transformed_sql_examplar['questionAugmented'] = question_augmented
        transformed_sql_examplar['modelName'] = table_name
        transformed_sql_examplar['dbSchema'] = db_schema
        transformed_sql_examplar['sql'] = sql
        transformed_sql_examplar['generatedSchemaLinkingCoT'] = generated_schema_linking_cot
        transformed_sql_examplar['generatedSchemaLinkings'] = generated_schema_linkings

        logger.debug(f'transformed_sql_examplar: {transformed_sql_examplar}')

        transformed_sql_examplar_list.append(transformed_sql_examplar)

    return transformed_sql_examplar_list
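
For reference, an illustrative call to transform_sql_example, reusing the sample question from auto_cot's __main__ block above (the field list, prior linking, and date are assumptions for demonstration only):

q_aug, db_schema, sql = transform_sql_example(
    question='没有获得过奖项的高校有哪几所?',
    current_date='2024-01-01',
    table_name='高校',
    field_list=['名称', '词条id'],
    prior_linkings={'奖项': '表名'},
    prior_exts='',
    sql='select 名称 from 高校 where 词条id not in ( select 高校id from 奖项 )')
# q_aug embeds the prior linkings, current date and remarks into the question text;
# db_schema is the flattened "Table: ..., Columns = ..." string used in the prompts.
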
@@ -1,79 +0,0 @@
# -*- coding:utf-8 -*-
import os
import sys
from typing import List, Mapping
from chromadb.api import Collection

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from instances.logging_instance import logger
from services.query_retrieval.retriever import ChromaCollectionRetriever


class FewShotPromptTemplate2(object):
    def __init__(self, collection: Collection, retrieval_key: str, few_shot_seperator: str = "\n\n") -> None:
        self.collection = collection
        self.few_shot_retriever = ChromaCollectionRetriever(self.collection)

        self.retrieval_key = retrieval_key

        self.few_shot_seperator = few_shot_seperator

    def add_few_shot_example(self, example_ids: List[str], example_units: List[Mapping[str, str]]) -> None:
        query_text_list = []

        for example_unit in example_units:
            query_text_list.append(example_unit[self.retrieval_key])

        self.few_shot_retriever.add_queries(query_text_list=query_text_list, query_id_list=example_ids, metadatas=example_units)

    def update_few_shot_example(self, example_ids: List[str], example_units: List[Mapping[str, str]]) -> None:
        query_text_list = []

        for example_unit in example_units:
            query_text_list.append(example_unit[self.retrieval_key])

        self.few_shot_retriever.update_queries(query_text_list=query_text_list, query_id_list=example_ids, metadatas=example_units)

    def delete_few_shot_example(self, example_ids: List[str]) -> None:
        self.few_shot_retriever.delete_queries_by_ids(query_ids=example_ids)

    def get_few_shot_example(self, example_ids: List[str]):
        return self.few_shot_retriever.get_query_by_ids(query_ids=example_ids)

    def count_few_shot_example(self) -> int:
        return self.few_shot_retriever.get_query_size()

    def reload_few_shot_example(self, example_ids: List[str], example_units: List[Mapping[str, str]]) -> None:
        logger.info(f"original {self.collection.name} size: {self.few_shot_retriever.get_query_size()}")

        self.few_shot_retriever.empty_query_collection()
        logger.info(f"emptied {self.collection.name} size: {self.few_shot_retriever.get_query_size()}")

        self.add_few_shot_example(example_ids=example_ids, example_units=example_units)
        logger.info(f"reloaded {self.collection.name} size: {self.few_shot_retriever.get_query_size()}")

    def _sub_dict(self, d: Mapping[str, str], keys: List[str]) -> Mapping[str, str]:
        return {k: d[k] for k in keys if k in d}

    def retrieve_few_shot_example(self, query_text: str, retrieval_num: int, filter_condition: Mapping[str, str] = None) -> List[Mapping[str, str]]:
        query_text_list = [query_text]
        retrieval_res_list = self.few_shot_retriever.retrieval_query_run(query_texts_list=query_text_list,
                                                                         filter_condition=filter_condition, n_results=retrieval_num)
        retrieval_res_unit_list = retrieval_res_list[0]['retrieval']

        return retrieval_res_unit_list

    def make_few_shot_example_prompt(self, few_shot_template: str, example_keys: List[str],
                                     few_shot_example_meta_list: List[Mapping[str, str]]) -> str:
        few_shot_example_str_unit_list = []

        retrieval_metas_list = [self._sub_dict(few_shot_example_meta['metadata'], example_keys) for few_shot_example_meta in few_shot_example_meta_list]

        for meta in retrieval_metas_list:
            few_shot_example_str_unit_list.append(few_shot_template.format(**meta))

        few_shot_example_str = self.few_shot_seperator.join(few_shot_example_str_unit_list)

        return few_shot_example_str
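
A typical wiring of FewShotPromptTemplate2, mirroring how the preset module later in this diff instantiates it (the collection name and example values here are made up; chromadb_client and emb_func are assumed to be configured as in that module):

collection = chromadb_client.get_or_create_collection(name='sql_examples', embedding_function=emb_func)
prompter = FewShotPromptTemplate2(collection=collection, retrieval_key='question')
prompter.add_few_shot_example(['0'], [{'question': '近7天的播放量', 'sql': 'SELECT ...'}])
hits = prompter.retrieve_few_shot_example('最近一周播放量', retrieval_num=1)
prompt = prompter.make_few_shot_example_prompt('Q: {question}\nSQL: {sql}', ['question', 'sql'], hits)
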
@@ -1,40 +0,0 @@
# -*- coding:utf-8 -*-
import json
import os
import sys
from typing import List, Mapping

import requests

sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from instances.logging_instance import logger
from config.config_parse import (
    TEXT2DSL_EXAMPLE_NUM, TEXT2DSL_FEWSHOTS_NUM, TEXT2DSL_SELF_CONSISTENCY_NUM,
    LLMPARSER_HOST, LLMPARSER_PORT,)
from few_shot_example.s2sql_exemplar import exemplars as sql_exemplars


def text2dsl_agent_wrapper_setting_update(llm_host: str, llm_port: str,
                                          sql_examplars: List[Mapping[str, str]],
                                          example_nums: int, fewshot_nums: int, self_consistency_nums: int):

    sql_ids = [str(i) for i in range(0, len(sql_examplars))]

    url = f"http://{llm_host}:{llm_port}/query2sql_setting_update"
    payload = {
        "sqlExamplars": sql_examplars, "sqlIds": sql_ids,
        "exampleNums": example_nums, "fewshotNums": fewshot_nums, "selfConsistencyNums": self_consistency_nums
    }
    headers = {'content-type': 'application/json'}
    response = requests.post(url, data=json.dumps(payload), headers=headers)
    logger.info(response.text)


if __name__ == "__main__":
    text2dsl_agent_wrapper_setting_update(LLMPARSER_HOST, LLMPARSER_PORT,
                                          sql_exemplars, TEXT2DSL_EXAMPLE_NUM, TEXT2DSL_FEWSHOTS_NUM, TEXT2DSL_SELF_CONSISTENCY_NUM)
@@ -1,59 +0,0 @@
# -*- coding:utf-8 -*-
import re

import os
import sys

sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from instances.logging_instance import logger


def schema_link_parse(schema_link_output: str):
    try:
        schema_link_output = schema_link_output.strip()
        pattern = r'Schema_links:(.*)'
        schema_link_output = re.findall(pattern, schema_link_output, re.DOTALL)[0].strip()
    except Exception as e:
        logger.exception(e)
        schema_link_output = None

    return schema_link_output


def combo_schema_link_parse(schema_linking_sql_combo_output: str):
    try:
        schema_linking_sql_combo_output = schema_linking_sql_combo_output.strip()
        pattern = r'Schema_links:(\[.*?\])|Schema_links: (\[.*?\])'
        schema_links_match = re.search(pattern, schema_linking_sql_combo_output)

        if schema_links_match.group(1):
            schema_links = schema_links_match.group(1)
        elif schema_links_match.group(2):
            schema_links = schema_links_match.group(2)
        else:
            schema_links = None

    except Exception as e:
        logger.exception(e)
        schema_links = None

    return schema_links


def combo_sql_parse(schema_linking_sql_combo_output: str):
    try:
        schema_linking_sql_combo_output = schema_linking_sql_combo_output.strip()
        pattern = r'SQL:(.*)'
        sql_match = re.search(pattern, schema_linking_sql_combo_output)

        if sql_match:
            sql = sql_match.group(1)
        else:
            sql = None
    except Exception as e:
        logger.exception(e)
        sql = None

    return sql
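
A quick illustration of the combo parsers on a typical completion (the sample string is made up for demonstration):

sample = 'A: ... 所以Schema_links是:\nSchema_links: [歌曲名, 播放量]\nSQL: SELECT 歌曲名 FROM 歌曲库'
combo_schema_link_parse(sample)  # -> '[歌曲名, 播放量]' (matched by the second regex alternative)
combo_sql_parse(sample)          # -> ' SELECT 歌曲名 FROM 歌曲库' (note: leading space is not stripped)
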
@@ -1,62 +0,0 @@
# -*- coding:utf-8 -*-

import asyncio

import os
import sys

sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

import json

from s2sql.constructor import FewShotPromptTemplate2
from s2sql.sql_agent import Text2DSLAgent, Text2DSLAgentAutoCoT, Text2DSLAgentWrapper

from instances.chromadb_instance import client as chromadb_client
from instances.logging_instance import logger
from instances.text2vec_instance import emb_func

from few_shot_example.s2sql_exemplar import exemplars as sql_exemplars
from config.config_parse import (TEXT2DSLAGENT_COLLECTION_NAME, TEXT2DSLAGENTACT_COLLECTION_NAME,
                                 TEXT2DSL_EXAMPLE_NUM, TEXT2DSL_FEWSHOTS_NUM, TEXT2DSL_SELF_CONSISTENCY_NUM,
                                 ACT_MIN_WINDOWN_SIZE, ACT_MAX_WINDOWN_SIZE)


text2dsl_agent_collection = chromadb_client.get_or_create_collection(name=TEXT2DSLAGENT_COLLECTION_NAME,
                                                                     embedding_function=emb_func,
                                                                     metadata={"hnsw:space": "cosine"})
text2dsl_agent_act_collection = chromadb_client.get_or_create_collection(name=TEXT2DSLAGENTACT_COLLECTION_NAME,
                                                                         embedding_function=emb_func,
                                                                         metadata={"hnsw:space": "cosine"})

text2dsl_agent_example_prompter = FewShotPromptTemplate2(collection=text2dsl_agent_collection,
                                                         retrieval_key="question",
                                                         few_shot_seperator='\n\n')
text2dsl_agent_act_example_prompter = FewShotPromptTemplate2(collection=text2dsl_agent_act_collection,
                                                             retrieval_key="question",
                                                             few_shot_seperator='\n\n')

text2sql_agent = Text2DSLAgent(num_fewshots=TEXT2DSL_FEWSHOTS_NUM, num_examples=TEXT2DSL_EXAMPLE_NUM, num_self_consistency=TEXT2DSL_SELF_CONSISTENCY_NUM,
                               sql_example_prompter=text2dsl_agent_example_prompter)
text2sql_agent_autoCoT = Text2DSLAgentAutoCoT(num_fewshots=TEXT2DSL_FEWSHOTS_NUM, num_examples=TEXT2DSL_EXAMPLE_NUM, num_self_consistency=TEXT2DSL_SELF_CONSISTENCY_NUM,
                                              sql_example_prompter=text2dsl_agent_act_example_prompter,
                                              auto_cot_min_window_size=ACT_MIN_WINDOWN_SIZE, auto_cot_max_window_size=ACT_MAX_WINDOWN_SIZE)

sql_ids = [str(i) for i in range(0, len(sql_exemplars))]
text2sql_agent.reload_setting(sql_ids, sql_exemplars, TEXT2DSL_EXAMPLE_NUM, TEXT2DSL_FEWSHOTS_NUM, TEXT2DSL_SELF_CONSISTENCY_NUM)

if text2sql_agent_autoCoT.count_examples() == 0:
    source_dir_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    example_dir_path = os.path.join(source_dir_path, 'few_shot_example')
    example_json_file = os.path.join(example_dir_path, 's2sql_exemplar3_transformed.json')
    with open(example_json_file, 'r', encoding='utf-8') as f:
        transformed_sql_examplar_list = json.load(f)

    transformed_sql_examplar_ids = [str(i) for i in range(0, len(transformed_sql_examplar_list))]
    text2sql_agent_autoCoT.reload_setting_autoCoT(transformed_sql_examplar_ids, transformed_sql_examplar_list, TEXT2DSL_EXAMPLE_NUM, TEXT2DSL_FEWSHOTS_NUM, TEXT2DSL_SELF_CONSISTENCY_NUM)


text2sql_agent_router = Text2DSLAgentWrapper(sql_agent_act=text2sql_agent_autoCoT)
@@ -1,836 +0,0 @@
import os
import sys
from typing import Dict, List, Optional, Union, Mapping, Any
from collections import Counter
import random
import asyncio
from enum import Enum

from langchain.llms.base import BaseLLM

sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from instances.logging_instance import logger

from s2sql.constructor import FewShotPromptTemplate2
from s2sql.output_parser import schema_link_parse, combo_schema_link_parse, combo_sql_parse
from s2sql.auto_cot_run import transform_sql_example, transform_sql_example_autoCoT_run
from instances.llm_instance import get_llm


class Text2DSLAgentBase(object):
    def __init__(self, num_fewshots: int, num_examples: int, num_self_consistency: int,
                 sql_example_prompter: FewShotPromptTemplate2) -> None:
        self.num_fewshots = num_fewshots
        self.num_examples = num_examples
        assert self.num_fewshots <= self.num_examples
        self.num_self_consistency = num_self_consistency

        self.sql_example_prompter = sql_example_prompter

    def get_examples_candidates(self, question: str, filter_condition: Mapping[str, str], num_examples: int) -> List[Mapping[str, str]]:
        few_shot_example_meta_list = self.sql_example_prompter.retrieve_few_shot_example(question, num_examples, filter_condition)

        if len(few_shot_example_meta_list) == num_examples:
            return few_shot_example_meta_list
        elif len(few_shot_example_meta_list) < num_examples:
            logger.info(f"few_shot_example_meta_list size: {len(few_shot_example_meta_list)} < num_examples: {num_examples}")
            existed_id_set = set([item['id'] for item in few_shot_example_meta_list])
            extra_few_shot_example_meta_list = self.sql_example_prompter.retrieve_few_shot_example(query_text=question, retrieval_num=num_examples, filter_condition=None)

            for item in extra_few_shot_example_meta_list:
                if item['id'] not in existed_id_set:
                    few_shot_example_meta_list.append(item)
                    existed_id_set.add(item['id'])
                    if len(few_shot_example_meta_list) == num_examples:
                        break

            logger.info(f"few_shot_example_meta_list size: {len(few_shot_example_meta_list)} = num_examples: {num_examples}")
            return few_shot_example_meta_list
        else:
            logger.info(f"few_shot_example_meta_list size: {len(few_shot_example_meta_list)} > num_examples: {num_examples}")
            few_shot_example_meta_list = few_shot_example_meta_list[:num_examples]
            return few_shot_example_meta_list

    def get_fewshot_example_combos(self, example_meta_list: List[Mapping[str, str]], num_fewshots: int) -> List[List[Mapping[str, str]]]:
        fewshot_example_list = []
        for i in range(0, self.num_self_consistency):
            random.shuffle(example_meta_list)
            fewshot_example_list.append(example_meta_list[:num_fewshots])

        return fewshot_example_list

    def self_consistency_vote(self, output_res_pool: List[str]):
        output_res_counts = Counter(output_res_pool)
        output_res_max = output_res_counts.most_common(1)[0][0]
        total_output_num = len(output_res_pool)

        vote_percentage = {k: (v / total_output_num) for k, v in output_res_counts.items()}

        return output_res_max, vote_percentage

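    # Illustrative behaviour of the vote above (toy candidates, not from the repo):
    #   self_consistency_vote(['[a,b]', '[a,b]', '[b,a]'])
    #   -> ('[a,b]', {'[a,b]': 2/3, '[b,a]': 1/3})
    # Candidates are normalised first via schema_linking_list_str_unify (below),
    # so order-only variants collapse to the same string before voting.
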
    def schema_linking_list_str_unify(self, schema_linking_list: List[str]) -> List[str]:
        schema_linking_list_unify = []
        for schema_linking_str in schema_linking_list:
            schema_linking_str_unify = ','.join(sorted([item.strip() for item in schema_linking_str.strip('[]').split(',')]))
            schema_linking_str_unify = f'[{schema_linking_str_unify}]'
            schema_linking_list_unify.append(schema_linking_str_unify)

        return schema_linking_list_unify

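    # e.g. schema_linking_list_str_unify(['[歌曲名, 播放量]', '[播放量,歌曲名]'])
    #      -> ['[播放量,歌曲名]', '[播放量,歌曲名]']  (items stripped and sorted, so both collapse to one form)
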
class Text2DSLAgentAutoCoT(Text2DSLAgentBase):
    def __init__(self, num_fewshots: int, num_examples: int, num_self_consistency: int,
                 sql_example_prompter: FewShotPromptTemplate2,
                 auto_cot_min_window_size: int, auto_cot_max_window_size: int):
        super().__init__(num_fewshots, num_examples, num_self_consistency, sql_example_prompter)

        assert auto_cot_min_window_size <= auto_cot_max_window_size
        self.auto_cot_min_window_size = auto_cot_min_window_size
        self.auto_cot_max_window_size = auto_cot_max_window_size

    def reload_setting(self, sql_example_ids: List[str], sql_example_units: List[Mapping[str, str]], num_examples: int, num_fewshots: int, num_self_consistency: int):
        self.num_fewshots = num_fewshots
        self.num_examples = num_examples
        assert self.num_fewshots <= self.num_examples
        self.num_self_consistency = num_self_consistency
        assert self.num_self_consistency >= 1

        new_sql_example_unit_list = transform_sql_example_autoCoT_run(sql_example_units, self.auto_cot_min_window_size, self.auto_cot_max_window_size)
        self.sql_example_prompter.reload_few_shot_example(sql_example_ids, new_sql_example_unit_list)

    def reload_setting_autoCoT(self, sql_example_ids: List[str], auto_cot_sql_example_units: List[Mapping[str, str]], num_examples: int, num_fewshots: int, num_self_consistency: int):
        self.num_fewshots = num_fewshots
        self.num_examples = num_examples
        assert self.num_fewshots <= self.num_examples
        self.num_self_consistency = num_self_consistency
        assert self.num_self_consistency >= 1

        self.sql_example_prompter.reload_few_shot_example(sql_example_ids, auto_cot_sql_example_units)

    def add_examples(self, sql_example_ids: List[str], sql_example_units: List[Mapping[str, str]]):
        new_sql_example_unit_list = transform_sql_example_autoCoT_run(sql_example_units, self.auto_cot_min_window_size, self.auto_cot_max_window_size)
        self.sql_example_prompter.add_few_shot_example(sql_example_ids, new_sql_example_unit_list)

    def update_examples(self, sql_example_ids: List[str], sql_example_units: List[Mapping[str, str]]):
        new_sql_example_unit_list = transform_sql_example_autoCoT_run(sql_example_units, self.auto_cot_min_window_size, self.auto_cot_max_window_size)
        self.sql_example_prompter.update_few_shot_example(sql_example_ids, new_sql_example_unit_list)

    def delete_examples(self, sql_example_ids: List[str]):
        self.sql_example_prompter.delete_few_shot_example(sql_example_ids)

    def count_examples(self):
        return self.sql_example_prompter.count_few_shot_example()

    def get_examples(self, sql_example_ids: List[str]):
        return self.sql_example_prompter.get_few_shot_example(sql_example_ids)

    def generate_schema_linking_prompt(self, question: str, current_date: str, domain_name: str, fields_list: List[str],
                                       prior_schema_links: Mapping[str, str], prior_exts: str, fewshot_example_list: List[Mapping[str, str]]) -> str:

        instruction = "# Find the schema_links for generating SQL queries for each question based on the database schema and Foreign keys."

        schema_linking_example_keys = ["questionAugmented", "dbSchema", "generatedSchemaLinkingCoT"]
        schema_linking_example_template = "{dbSchema}\nQ: {questionAugmented}\nA: {generatedSchemaLinkingCoT}"
        schema_linking_fewshot_prompt = self.sql_example_prompter.make_few_shot_example_prompt(few_shot_template=schema_linking_example_template,
                                                                                               example_keys=schema_linking_example_keys,
                                                                                               few_shot_example_meta_list=fewshot_example_list)

        question_augmented, db_schema, _ = transform_sql_example(question, current_date, domain_name, fields_list, prior_schema_links, prior_exts)
        new_case_template = """{dbSchema}\nQ: {questionAugmented1}\nA: Let’s think step by step. In the question "{questionAugmented2}", we are asked:"""
        new_case_prompt = new_case_template.format(dbSchema=db_schema, questionAugmented1=question_augmented, questionAugmented2=question_augmented)

        schema_linking_prompt = instruction + '\n\n' + schema_linking_fewshot_prompt + '\n\n' + new_case_prompt

        logger.info(f'schema_linking_prompt: {schema_linking_prompt}')
        return schema_linking_prompt

    def generate_schema_linking_prompt_pool(self, question: str, current_date: str, domain_name: str, fields_list: List[str],
                                            prior_schema_links: Mapping[str, str], prior_exts: str, fewshot_example_list_pool: List[List[Mapping[str, str]]]) -> List[str]:
        schema_linking_prompt_pool = []
        for fewshot_example_list in fewshot_example_list_pool:
            schema_linking_prompt = self.generate_schema_linking_prompt(question, current_date, domain_name, fields_list, prior_schema_links, prior_exts, fewshot_example_list)
            schema_linking_prompt_pool.append(schema_linking_prompt)

        return schema_linking_prompt_pool

    def generate_sql_prompt(self, question: str, domain_name: str, fields_list: List[str],
                            schema_link_str: str, current_date: str, prior_schema_links: Mapping[str, str], prior_exts: str,
                            fewshot_example_list: List[Mapping[str, str]], terms_list: Optional[List[Dict]] = []) -> str:

instruction = "# Use the the schema links to generate the SQL queries for each of the questions."
        sql_example_keys = ["questionAugmented", "dbSchema", "generatedSchemaLinkings", "sql"]
        sql_example_template = "{dbSchema}\nQ: {questionAugmented}\nSchema_links: {generatedSchemaLinkings}\nSQL: {sql}"

        sql_example_fewshot_prompt = self.sql_example_prompter.make_few_shot_example_prompt(few_shot_template=sql_example_template,
                                                                                            example_keys=sql_example_keys,
                                                                                            few_shot_example_meta_list=fewshot_example_list)

        question_augmented, db_schema, _ = transform_sql_example(question, current_date, domain_name, fields_list, prior_schema_links, prior_exts, terms_list=terms_list)
        new_case_template = "{dbSchema}\nQ: {questionAugmented}\nSchema_links: {schemaLinkings}\nSQL: "
        new_case_prompt = new_case_template.format(dbSchema=db_schema, questionAugmented=question_augmented, schemaLinkings=schema_link_str)

        sql_example_prompt = instruction + '\n\n' + sql_example_fewshot_prompt + '\n\n' + new_case_prompt

        logger.info(f'sql_example_prompt: {sql_example_prompt}')
        return sql_example_prompt

    def generate_sql_prompt_pool(self, question: str, domain_name: str, fields_list: List[str],
                                 schema_link_str_pool: List[str], current_date: str, prior_schema_links: Mapping[str, str], prior_exts: str,
                                 fewshot_example_list_pool: List[List[Mapping[str, str]]], terms_list: Optional[List[Dict]] = []) -> List[str]:
        sql_prompt_pool = []
        for schema_link_str, fewshot_example_list in zip(schema_link_str_pool, fewshot_example_list_pool):
            sql_prompt = self.generate_sql_prompt(question, domain_name, fields_list, schema_link_str, current_date, prior_schema_links, prior_exts, fewshot_example_list, terms_list=terms_list)
            sql_prompt_pool.append(sql_prompt)

        return sql_prompt_pool

    def generate_schema_linking_sql_prompt(self, question: str, current_date: str, domain_name: str, fields_list: List[str],
                                           prior_schema_links: Mapping[str, str], prior_exts: str, fewshot_example_list: List[Mapping[str, str]], terms_list: Optional[List[Dict]] = []):

instruction = "# Find the schema_links for generating SQL queries for each question based on the database schema and Foreign keys. Then use the the schema links to generate the SQL queries for each of the questions."

        example_keys = ["questionAugmented", "dbSchema", "generatedSchemaLinkingCoT", "sql"]
        example_template = "{dbSchema}\nQ: {questionAugmented}\nA: {generatedSchemaLinkingCoT}\nSQL: {sql}\n"
        fewshot_prompt = self.sql_example_prompter.make_few_shot_example_prompt(few_shot_template=example_template,
                                                                                example_keys=example_keys,
                                                                                few_shot_example_meta_list=fewshot_example_list)

        question_augmented, db_schema, _ = transform_sql_example(question, current_date, domain_name, fields_list, prior_schema_links, prior_exts, terms_list=terms_list)
        new_case_template = """{dbSchema}\nQ: {questionAugmented1}\nA: Let’s think step by step. In the question "{questionAugmented2}", we are asked:"""
        new_case_prompt = new_case_template.format(dbSchema=db_schema, questionAugmented1=question_augmented, questionAugmented2=question_augmented)

        prompt = instruction + '\n\n' + fewshot_prompt + '\n\n' + new_case_prompt

        logger.info(f'schema_linking_sql_prompt: {prompt}')
        return prompt

    def generate_schema_linking_sql_prompt_pool(self, question: str, current_date: str, domain_name: str, fields_list: List[str],
                                                prior_schema_links: Mapping[str, str], prior_exts: str, fewshot_example_list_pool: List[List[Mapping[str, str]]], terms_list: Optional[List[Dict]] = []) -> List[str]:
        schema_linking_sql_prompt_pool = []
        for fewshot_example_list in fewshot_example_list_pool:
            schema_linking_sql_prompt = self.generate_schema_linking_sql_prompt(question, current_date, domain_name, fields_list, prior_schema_links, prior_exts, fewshot_example_list, terms_list=terms_list)
            schema_linking_sql_prompt_pool.append(schema_linking_sql_prompt)

        return schema_linking_sql_prompt_pool

    async def async_query2sql(self, question: str, filter_condition: Mapping[str, str],
                              model_name: str, fields_list: List[str],
                              current_date: str, prior_schema_links: Mapping[str, str], prior_exts: str,
                              llm_config: dict, terms_list: Optional[List[Dict]] = []):
        logger.info("question: {}".format(question))
        logger.info("filter_condition: {}".format(filter_condition))
        logger.info("model_name: {}".format(model_name))
        logger.info("fields_list: {}".format(fields_list))
        logger.info("current_date: {}".format(current_date))
        logger.info("prior_schema_links: {}".format(prior_schema_links))
        logger.info("prior_exts: {}".format(prior_exts))
        logger.info("terms_list: {}".format(terms_list))

        fewshot_example_meta_list = self.get_examples_candidates(question, filter_condition, self.num_examples)
        schema_linking_prompt = self.generate_schema_linking_prompt(question, current_date, model_name, fields_list, prior_schema_links, prior_exts, fewshot_example_meta_list)
        logger.debug("schema_linking_prompt->{}".format(schema_linking_prompt))
        llm = get_llm(llm_config)
        schema_link_output = await llm._call_async(schema_linking_prompt)
        logger.debug("schema_link_output->{}".format(schema_link_output))

        schema_link_str = schema_link_parse(schema_link_output)
        logger.debug("schema_link_str->{}".format(schema_link_str))

        sql_prompt = self.generate_sql_prompt(question, model_name, fields_list, schema_link_str, current_date, prior_schema_links, prior_exts, fewshot_example_meta_list, terms_list=terms_list)
        logger.debug("sql_prompt->{}".format(sql_prompt))
        sql_output = await llm._call_async(sql_prompt)

        resp = dict()
        resp['question'] = question
        resp['model'] = model_name
        resp['fields'] = fields_list
        resp['priorSchemaLinking'] = prior_schema_links
        resp['priorExts'] = prior_exts
        resp['currentDate'] = current_date

        resp['prompt'] = [schema_linking_prompt + '\n\n' + sql_prompt]

        resp['schemaLinkingOutput'] = schema_link_output
        resp['schemaLinkStr'] = schema_link_str

        resp['sqlOutput'] = sql_output

        logger.info("resp: {}".format(resp))

        return resp

    async def async_query2sql_shortcut(self, question: str, filter_condition: Mapping[str, str],
                                       model_name: str, fields_list: List[str],
                                       current_date: str, prior_schema_links: Mapping[str, str], prior_exts: str,
                                       llm_config: dict, terms_list: Optional[List[Dict]] = []):
        logger.info("question: {}".format(question))
        logger.info("filter_condition: {}".format(filter_condition))
        logger.info("model_name: {}".format(model_name))
        logger.info("fields_list: {}".format(fields_list))
        logger.info("current_date: {}".format(current_date))
        logger.info("prior_schema_links: {}".format(prior_schema_links))
        logger.info("prior_exts: {}".format(prior_exts))
        logger.info("terms_list: {}".format(terms_list))

        fewshot_example_meta_list = self.get_examples_candidates(question, filter_condition, self.num_examples)
        schema_linking_sql_shortcut_prompt = self.generate_schema_linking_sql_prompt(question, current_date, model_name, fields_list, prior_schema_links, prior_exts, fewshot_example_meta_list, terms_list)
        logger.debug("schema_linking_sql_shortcut_prompt->{}".format(schema_linking_sql_shortcut_prompt))
        llm = get_llm(llm_config)
        schema_linking_sql_shortcut_output = await llm._call_async(schema_linking_sql_shortcut_prompt)
        logger.debug("schema_linking_sql_shortcut_output->{}".format(schema_linking_sql_shortcut_output))

        schema_linking_str = combo_schema_link_parse(schema_linking_sql_shortcut_output)
        sql_str = combo_sql_parse(schema_linking_sql_shortcut_output)

        resp = dict()
        resp['question'] = question
        resp['model'] = model_name
        resp['fields'] = fields_list
        resp['priorSchemaLinking'] = prior_schema_links
        resp['priorExts'] = prior_exts
        resp['currentDate'] = current_date

        resp['prompt'] = [schema_linking_sql_shortcut_prompt]

        resp['schemaLinkingComboOutput'] = schema_linking_sql_shortcut_output
        resp['schemaLinkStr'] = schema_linking_str
        resp['sqlOutput'] = sql_str

        logger.info("resp: {}".format(resp))

        return resp

    async def generate_schema_linking_tasks(self, question: str, model_name: str, fields_list: List[str],
                                            current_date: str, prior_schema_links: Mapping[str, str], prior_exts: str,
                                            fewshot_example_list_combo: List[List[Mapping[str, str]]], llm_config: dict):

        schema_linking_prompt_pool = self.generate_schema_linking_prompt_pool(question, current_date, model_name, fields_list, prior_schema_links, prior_exts, fewshot_example_list_combo)
        logger.debug("schema_linking_prompt_pool->{}".format(schema_linking_prompt_pool))
        llm = get_llm(llm_config)
        schema_linking_output_pool = await asyncio.gather(*[llm._call_async(schema_linking_prompt) for schema_linking_prompt in schema_linking_prompt_pool])
        logger.debug("schema_linking_output_pool->{}".format(schema_linking_output_pool))

        schema_linking_str_pool = [schema_link_parse(schema_linking_output) for schema_linking_output in schema_linking_output_pool]

        return schema_linking_str_pool, schema_linking_output_pool, schema_linking_prompt_pool

    async def generate_sql_tasks(self, question: str, model_name: str, fields_list: List[str], schema_link_str_pool: List[str],
                                 current_date: str, prior_schema_links: Mapping[str, str], prior_exts: str, fewshot_example_list_combo: List[List[Mapping[str, str]]], llm_config: dict, terms_list: Optional[List[Dict]] = []):

        sql_prompt_pool = self.generate_sql_prompt_pool(question, model_name, fields_list, schema_link_str_pool, current_date, prior_schema_links, prior_exts, fewshot_example_list_combo, terms_list=terms_list)
        logger.debug("sql_prompt_pool->{}".format(sql_prompt_pool))
        llm = get_llm(llm_config)
        sql_output_pool = await asyncio.gather(*[llm._call_async(sql_prompt) for sql_prompt in sql_prompt_pool])
        logger.debug("sql_output_pool->{}".format(sql_output_pool))

        return sql_output_pool, sql_prompt_pool

    async def generate_schema_linking_sql_tasks(self, question: str, model_name: str, fields_list: List[str],
                                                current_date: str, prior_schema_links: Mapping[str, str], prior_exts: str,
                                                fewshot_example_list_combo: List[List[Mapping[str, str]]], llm_config: dict, terms_list: Optional[List[Dict]] = []):
        schema_linking_sql_prompt_pool = self.generate_schema_linking_sql_prompt_pool(question, current_date, model_name, fields_list, prior_schema_links, prior_exts, fewshot_example_list_combo, terms_list=terms_list)
        llm = get_llm(llm_config)
        schema_linking_sql_output_task_pool = [llm._call_async(schema_linking_sql_prompt) for schema_linking_sql_prompt in schema_linking_sql_prompt_pool]
        schema_linking_sql_output_res_pool = await asyncio.gather(*schema_linking_sql_output_task_pool)
        logger.debug("schema_linking_sql_output_res_pool->{}".format(schema_linking_sql_output_res_pool))

        return schema_linking_sql_output_res_pool, schema_linking_sql_prompt_pool, schema_linking_sql_output_task_pool

    async def tasks_run(self, question: str, filter_condition: Mapping[str, str],
                        model_name: str, fields_list: List[str],
                        current_date: str, prior_schema_links: Mapping[str, str], prior_exts: str, llm_config: dict, terms_list: Optional[List[Dict]] = []):
        logger.info("question: {}".format(question))
        logger.info("filter_condition: {}".format(filter_condition))
        logger.info("model_name: {}".format(model_name))
        logger.info("fields_list: {}".format(fields_list))
        logger.info("current_date: {}".format(current_date))
        logger.info("prior_schema_links: {}".format(prior_schema_links))
        logger.info("prior_exts: {}".format(prior_exts))
        logger.info("terms_list: {}".format(terms_list))

        fewshot_example_meta_list = self.get_examples_candidates(question, filter_condition, self.num_examples)
        fewshot_example_list_combo = self.get_fewshot_example_combos(fewshot_example_meta_list, self.num_fewshots)

        schema_linking_candidate_list, _, schema_linking_prompt_list = await self.generate_schema_linking_tasks(question, model_name, fields_list, current_date, prior_schema_links, prior_exts, fewshot_example_list_combo, llm_config)
        logger.debug(f'schema_linking_candidate_list:{schema_linking_candidate_list}')
        schema_linking_candidate_sorted_list = self.schema_linking_list_str_unify(schema_linking_candidate_list)
        logger.debug(f'schema_linking_candidate_sorted_list:{schema_linking_candidate_sorted_list}')

        schema_linking_output_max, schema_linking_output_vote_percentage = self.self_consistency_vote(schema_linking_candidate_sorted_list)

        sql_output_candidates, sql_output_prompt_list = await self.generate_sql_tasks(question, model_name, fields_list, schema_linking_candidate_list, current_date, prior_schema_links, prior_exts, fewshot_example_list_combo, llm_config, terms_list=terms_list)
        logger.debug(f'sql_output_candidates:{sql_output_candidates}')
        sql_output_max, sql_output_vote_percentage = self.self_consistency_vote(sql_output_candidates)

        resp = dict()
        resp['question'] = question
        resp['model'] = model_name
        resp['fields'] = fields_list
        resp['priorSchemaLinking'] = prior_schema_links
        resp['priorExts'] = prior_exts
        resp['currentDate'] = current_date

        resp['prompt'] = [schema_linking_prompt + '\n\n' + sql_prompt for schema_linking_prompt, sql_prompt in zip(schema_linking_prompt_list, sql_output_prompt_list)]

        resp['schemaLinkStr'] = schema_linking_output_max
        resp['schemaLinkingWeight'] = schema_linking_output_vote_percentage

        resp['sqlOutput'] = sql_output_max
        resp['sqlWeight'] = sql_output_vote_percentage

        logger.info("resp: {}".format(resp))

        return resp

    async def tasks_run_shortcut(self, question: str, filter_condition: Mapping[str, str], model_name: str, fields_list: List[str],
                                 current_date: str, prior_schema_links: Mapping[str, str], prior_exts: str, llm_config: dict, terms_list: Optional[List[Dict]] = []):
        logger.info("question: {}".format(question))
        logger.info("filter_condition: {}".format(filter_condition))
        logger.info("model_name: {}".format(model_name))
        logger.info("fields_list: {}".format(fields_list))
        logger.info("current_date: {}".format(current_date))
        logger.info("prior_schema_links: {}".format(prior_schema_links))
        logger.info("prior_exts: {}".format(prior_exts))
        logger.info("terms_list: {}".format(terms_list))

        fewshot_example_meta_list = self.get_examples_candidates(question, filter_condition, self.num_examples)
        fewshot_example_list_combo = self.get_fewshot_example_combos(fewshot_example_meta_list, self.num_fewshots)

        schema_linking_sql_output_candidates, schema_linking_sql_prompt_list, _ = await self.generate_schema_linking_sql_tasks(question, model_name, fields_list, current_date, prior_schema_links, prior_exts, fewshot_example_list_combo, llm_config=llm_config, terms_list=terms_list)
        logger.debug(f'schema_linking_sql_output_candidates:{schema_linking_sql_output_candidates}')
        schema_linking_output_candidate_list = [combo_schema_link_parse(schema_linking_sql_output_candidate) for schema_linking_sql_output_candidate in schema_linking_sql_output_candidates]
        logger.debug(f'schema_linking_output_candidate_list:{schema_linking_output_candidate_list}')
        schema_linking_output_candidate_sorted_list = self.schema_linking_list_str_unify(schema_linking_output_candidate_list)

        schema_linking_output_max, schema_linking_output_vote_percentage = self.self_consistency_vote(schema_linking_output_candidate_sorted_list)

        sql_output_candidate_list = [combo_sql_parse(schema_linking_sql_output_candidate) for schema_linking_sql_output_candidate in schema_linking_sql_output_candidates]
        logger.debug(f'sql_output_candidate_list:{sql_output_candidate_list}')
        sql_output_max, sql_output_vote_percentage = self.self_consistency_vote(sql_output_candidate_list)

        resp = dict()
        resp['question'] = question
        resp['model'] = model_name
        resp['fields'] = fields_list
        resp['priorSchemaLinking'] = prior_schema_links
        resp['priorExts'] = prior_exts
        resp['currentDate'] = current_date

        resp['prompt'] = schema_linking_sql_prompt_list

        resp['schemaLinkStr'] = schema_linking_output_max
        resp['schemaLinkingWeight'] = schema_linking_output_vote_percentage

        resp['sqlOutput'] = sql_output_max
        resp['sqlWeight'] = sql_output_vote_percentage

        logger.info("resp: {}".format(resp))

        return resp


class Text2DSLAgent(Text2DSLAgentBase):
    def __init__(self, num_fewshots: int, num_examples: int, num_self_consistency: int,
                 sql_example_prompter: FewShotPromptTemplate2) -> None:
        super().__init__(num_fewshots, num_examples, num_self_consistency, sql_example_prompter)

    def reload_setting(self, sql_example_ids: List[str], sql_example_units: List[Mapping[str, str]], num_examples: int, num_fewshots: int, num_self_consistency: int):
        self.num_fewshots = num_fewshots
        self.num_examples = num_examples
        assert self.num_fewshots <= self.num_examples
        self.num_self_consistency = num_self_consistency
        assert self.num_self_consistency >= 1
        self.sql_example_prompter.reload_few_shot_example(sql_example_ids, sql_example_units)

    def add_examples(self, sql_example_ids: List[str], sql_example_units: List[Mapping[str, str]]):
        self.sql_example_prompter.add_few_shot_example(sql_example_ids, sql_example_units)

    def update_examples(self, sql_example_ids: List[str], sql_example_units: List[Mapping[str, str]]):
        self.sql_example_prompter.update_few_shot_example(sql_example_ids, sql_example_units)

    def delete_examples(self, sql_example_ids: List[str]):
        self.sql_example_prompter.delete_few_shot_example(sql_example_ids)

    def get_examples(self, sql_example_ids: List[str]):
        return self.sql_example_prompter.get_few_shot_example(sql_example_ids)

    def count_examples(self):
        return self.sql_example_prompter.count_few_shot_example()

    def generate_schema_linking_prompt(self, question: str, domain_name: str, fields_list: List[str],
                                       prior_schema_links: Mapping[str, str], fewshot_example_list: List[Mapping[str, str]]) -> str:

        prior_schema_links_str = '[' + ','.join(["""'{}'->{}""".format(k, v) for k, v in prior_schema_links.items()]) + ']'

        instruction = "# 根据数据库的表结构,参考先验信息,找出为每个问题生成SQL查询语句的schema_links"

        schema_linking_example_keys = ["tableName", "fieldsList", "priorSchemaLinks", "question", "analysis", "schemaLinks"]
        schema_linking_example_template = "Table {tableName}, columns = {fieldsList}, prior_schema_links = {priorSchemaLinks}\n问题:{question}\n分析:{analysis} 所以Schema_links是:\nSchema_links:{schemaLinks}"
        schema_linking_fewshot_prompt = self.sql_example_prompter.make_few_shot_example_prompt(few_shot_template=schema_linking_example_template,
                                                                                               example_keys=schema_linking_example_keys,
                                                                                               few_shot_example_meta_list=fewshot_example_list)

        new_case_template = "Table {tableName}, columns = {fieldsList}, prior_schema_links = {priorSchemaLinks}\n问题:{question}\n分析: 让我们一步一步地思考。"
        new_case_prompt = new_case_template.format(tableName=domain_name, fieldsList=fields_list, priorSchemaLinks=prior_schema_links_str, question=question)

        schema_linking_prompt = instruction + '\n\n' + schema_linking_fewshot_prompt + '\n\n' + new_case_prompt
        return schema_linking_prompt

    def generate_schema_linking_prompt_pool(self, question: str, domain_name: str, fields_list: List[str],
                                            prior_schema_links: Mapping[str, str], fewshot_example_list_pool: List[List[Mapping[str, str]]]) -> List[str]:
        schema_linking_prompt_pool = []
        for fewshot_example_list in fewshot_example_list_pool:
            schema_linking_prompt = self.generate_schema_linking_prompt(question, domain_name, fields_list, prior_schema_links, fewshot_example_list)
            schema_linking_prompt_pool.append(schema_linking_prompt)

        return schema_linking_prompt_pool

    def generate_sql_prompt(self, question: str, domain_name: str,
                            schema_link_str: str, data_date: str,
                            fewshot_example_list: List[Mapping[str, str]]) -> str:
        instruction = "# 根据schema_links为每个问题生成SQL查询语句"
        sql_example_keys = ["question", "currentDate", "tableName", "schemaLinks", "sql"]
        sql_example_template = "问题:{question}\nCurrent_date:{currentDate}\nTable {tableName}\nSchema_links:{schemaLinks}\nSQL:{sql}"

        sql_example_fewshot_prompt = self.sql_example_prompter.make_few_shot_example_prompt(few_shot_template=sql_example_template,
                                                                                            example_keys=sql_example_keys,
                                                                                            few_shot_example_meta_list=fewshot_example_list)

        new_case_template = "问题:{question}\nCurrent_date:{currentDate}\nTable {tableName}\nSchema_links:{schemaLinks}\nSQL:"
        new_case_prompt = new_case_template.format(question=question, currentDate=data_date, tableName=domain_name, schemaLinks=schema_link_str)

        sql_example_prompt = instruction + '\n\n' + sql_example_fewshot_prompt + '\n\n' + new_case_prompt

        return sql_example_prompt

    def generate_sql_prompt_pool(self, question: str, domain_name: str, data_date: str,
                                 schema_link_str_pool: List[str], fewshot_example_list_pool: List[List[Mapping[str, str]]]) -> List[str]:
        sql_prompt_pool = []
        for schema_link_str, fewshot_example_list in zip(schema_link_str_pool, fewshot_example_list_pool):
            sql_prompt = self.generate_sql_prompt(question, domain_name, schema_link_str, data_date, fewshot_example_list)
            sql_prompt_pool.append(sql_prompt)

        return sql_prompt_pool

    def generate_schema_linking_sql_prompt(self, question: str,
                                           domain_name: str,
                                           data_date: str,
                                           fields_list: List[str],
                                           prior_schema_links: Mapping[str, str],
                                           fewshot_example_list: List[Mapping[str, str]]):

        prior_schema_links_str = '[' + ','.join(["""'{}'->{}""".format(k, v) for k, v in prior_schema_links.items()]) + ']'

        instruction = "# 根据数据库的表结构,参考先验信息,找出为每个问题生成SQL查询语句的schema_links,再根据schema_links为每个问题生成SQL查询语句"

        example_keys = ["tableName", "fieldsList", "priorSchemaLinks", "currentDate", "question", "analysis", "schemaLinks", "sql"]
        example_template = "Table {tableName}, columns = {fieldsList}, prior_schema_links = {priorSchemaLinks}\nCurrent_date:{currentDate}\n问题:{question}\n分析:{analysis} 所以Schema_links是:\nSchema_links:{schemaLinks}\nSQL:{sql}"
        fewshot_prompt = self.sql_example_prompter.make_few_shot_example_prompt(few_shot_template=example_template,
                                                                                example_keys=example_keys,
                                                                                few_shot_example_meta_list=fewshot_example_list)

        new_case_template = "Table {tableName}, columns = {fieldsList}, prior_schema_links = {priorSchemaLinks}\nCurrent_date:{currentDate}\n问题:{question}\n分析: 让我们一步一步地思考。"
        new_case_prompt = new_case_template.format(tableName=domain_name, fieldsList=fields_list, priorSchemaLinks=prior_schema_links_str, currentDate=data_date, question=question)

        prompt = instruction + '\n\n' + fewshot_prompt + '\n\n' + new_case_prompt

        return prompt

    def generate_schema_linking_sql_prompt_pool(self, question: str, domain_name: str, fields_list: List[str], data_date: str,
                                                prior_schema_links: Mapping[str, str], fewshot_example_list_pool: List[List[Mapping[str, str]]]) -> List[str]:
        schema_linking_sql_prompt_pool = []
        for fewshot_example_list in fewshot_example_list_pool:
            schema_linking_sql_prompt = self.generate_schema_linking_sql_prompt(question, domain_name, data_date, fields_list, prior_schema_links, fewshot_example_list)
            schema_linking_sql_prompt_pool.append(schema_linking_sql_prompt)

        return schema_linking_sql_prompt_pool

    def self_consistency_vote(self, output_res_pool: List[str]):
        output_res_counts = Counter(output_res_pool)
        output_res_max = output_res_counts.most_common(1)[0][0]
        total_output_num = len(output_res_pool)

        vote_percentage = {k: (v / total_output_num) for k, v in output_res_counts.items()}

        return output_res_max, vote_percentage

    def schema_linking_list_str_unify(self, schema_linking_list: List[str]) -> List[str]:
        schema_linking_list_unify = []
        for schema_linking_str in schema_linking_list:
            schema_linking_str_unify = ','.join(sorted([item.strip() for item in schema_linking_str.strip('[]').split(',')]))
            schema_linking_str_unify = f'[{schema_linking_str_unify}]'
            schema_linking_list_unify.append(schema_linking_str_unify)

        return schema_linking_list_unify

    async def generate_schema_linking_tasks(self, question: str, domain_name: str,
                                            fields_list: List[str], prior_schema_links: Mapping[str, str],
                                            fewshot_example_list_combo: List[List[Mapping[str, str]]], llm_config: dict):

        schema_linking_prompt_pool = self.generate_schema_linking_prompt_pool(question, domain_name,
                                                                              fields_list, prior_schema_links,
                                                                              fewshot_example_list_combo)
        llm = get_llm(llm_config)
        schema_linking_output_task_pool = [llm._call_async(schema_linking_prompt) for schema_linking_prompt in schema_linking_prompt_pool]
        schema_linking_output_pool = await asyncio.gather(*schema_linking_output_task_pool)
        logger.debug(f'schema_linking_output_pool:{schema_linking_output_pool}')

        schema_linking_str_pool = [schema_link_parse(schema_linking_output) for schema_linking_output in schema_linking_output_pool]

        return schema_linking_str_pool

    async def generate_sql_tasks(self, question: str, domain_name: str, data_date: str,
                                 schema_link_str_pool: List[str], fewshot_example_list_combo: List[List[Mapping[str, str]]],
                                 llm_config: dict):

sql_prompt_pool = self.generate_sql_prompt_pool(question, domain_name, schema_link_str_pool, data_date, fewshot_example_list_combo)
|
|
||||||
llm = get_llm(llm_config)
|
|
||||||
sql_output_task_pool = [llm._call_async(sql_prompt) for sql_prompt in sql_prompt_pool]
|
|
||||||
sql_output_res_pool = await asyncio.gather(*sql_output_task_pool)
|
|
||||||
logger.debug(f'sql_output_res_pool:{sql_output_res_pool}')
|
|
||||||
|
|
||||||
return sql_output_res_pool
|
|
||||||
|
|
||||||
async def generate_schema_linking_sql_tasks(self, question: str, domain_name: str, fields_list: List[str], data_date: str,
|
|
||||||
prior_schema_links: Mapping[str,str], fewshot_example_list_combo:List[List[Mapping[str, str]]],
|
|
||||||
llm_config: dict):
|
|
||||||
schema_linking_sql_prompt_pool = self.generate_schema_linking_sql_prompt_pool(question, domain_name, fields_list, data_date, prior_schema_links, fewshot_example_list_combo)
|
|
||||||
llm = get_llm(llm_config)
|
|
||||||
schema_linking_sql_output_task_pool = [llm._call_async(schema_linking_sql_prompt) for schema_linking_sql_prompt in schema_linking_sql_prompt_pool]
|
|
||||||
schema_linking_sql_output_res_pool = await asyncio.gather(*schema_linking_sql_output_task_pool)
|
|
||||||
logger.debug(f'schema_linking_sql_output_res_pool:{schema_linking_sql_output_res_pool}')
|
|
||||||
|
|
||||||
return schema_linking_sql_output_res_pool

    async def tasks_run(self, question: str, filter_condition: Mapping[str, str], domain_name: str, fields_list: List[str],
                        prior_schema_links: Mapping[str, str], data_date: str, prior_exts: str, llm_config: dict):
        logger.info("question: {}".format(question))
        logger.info("domain_name: {}".format(domain_name))
        logger.info("fields_list: {}".format(fields_list))
        logger.info("current_date: {}".format(data_date))
        logger.info("prior_schema_links: {}".format(prior_schema_links))
        logger.info("prior_exts: {}".format(prior_exts))

        if prior_exts != '':
            question = question + ' 备注:' + prior_exts  # 备注 = "note/remark"
            logger.info("question_prior_exts: {}".format(question))

        fewshot_example_meta_list = self.get_examples_candidates(question, filter_condition, self.num_examples)
        fewshot_example_list_combo = self.get_fewshot_example_combos(fewshot_example_meta_list, self.num_fewshots)

        schema_linking_candidate_list = await self.generate_schema_linking_tasks(question, domain_name, fields_list, prior_schema_links, fewshot_example_list_combo, llm_config)
        logger.debug(f'schema_linking_candidate_list:{schema_linking_candidate_list}')
        schema_linking_candidate_sorted_list = self.schema_linking_list_str_unify(schema_linking_candidate_list)
        logger.debug(f'schema_linking_candidate_sorted_list:{schema_linking_candidate_sorted_list}')

        schema_linking_output_max, schema_linking_output_vote_percentage = self.self_consistency_vote(schema_linking_candidate_sorted_list)

        sql_output_candidates = await self.generate_sql_tasks(question, domain_name, data_date, schema_linking_candidate_list, fewshot_example_list_combo, llm_config)
        logger.debug(f'sql_output_candidates:{sql_output_candidates}')
        sql_output_max, sql_output_vote_percentage = self.self_consistency_vote(sql_output_candidates)

        resp = dict()
        resp['question'] = question
        resp['model'] = domain_name
        resp['fields'] = fields_list
        resp['priorSchemaLinking'] = prior_schema_links
        resp['dataDate'] = data_date

        resp['schemaLinkStr'] = schema_linking_output_max
        resp['schemaLinkingWeight'] = schema_linking_output_vote_percentage

        resp['sqlOutput'] = sql_output_max
        resp['sqlWeight'] = sql_output_vote_percentage

        logger.info("resp: {}".format(resp))

        return resp

    async def tasks_run_shortcut(self, question: str, filter_condition: Mapping[str, str], domain_name: str, fields_list: List[str],
                                 prior_schema_links: Mapping[str, str], data_date: str, prior_exts: str, llm_config: dict):
        logger.info("question: {}".format(question))
        logger.info("domain_name: {}".format(domain_name))
        logger.info("fields_list: {}".format(fields_list))
        logger.info("current_date: {}".format(data_date))
        logger.info("prior_schema_links: {}".format(prior_schema_links))
        logger.info("prior_exts: {}".format(prior_exts))

        if prior_exts != '':
            question = question + ' 备注:' + prior_exts  # 备注 = "note/remark"
            logger.info("question_prior_exts: {}".format(question))

        fewshot_example_meta_list = self.get_examples_candidates(question, filter_condition, self.num_examples)
        fewshot_example_list_combo = self.get_fewshot_example_combos(fewshot_example_meta_list, self.num_fewshots)

        schema_linking_sql_output_candidates = await self.generate_schema_linking_sql_tasks(question, domain_name, fields_list, data_date, prior_schema_links, fewshot_example_list_combo, llm_config)
        logger.debug(f'schema_linking_sql_output_candidates:{schema_linking_sql_output_candidates}')
        schema_linking_output_candidate_list = [combo_schema_link_parse(schema_linking_sql_output_candidate) for schema_linking_sql_output_candidate in schema_linking_sql_output_candidates]
        logger.debug(f'schema_linking_output_candidate_list:{schema_linking_output_candidate_list}')
        schema_linking_output_candidate_sorted_list = self.schema_linking_list_str_unify(schema_linking_output_candidate_list)

        schema_linking_output_max, schema_linking_output_vote_percentage = self.self_consistency_vote(schema_linking_output_candidate_sorted_list)

        sql_output_candidate_list = [combo_sql_parse(schema_linking_sql_output_candidate) for schema_linking_sql_output_candidate in schema_linking_sql_output_candidates]
        logger.debug(f'sql_output_candidate_list:{sql_output_candidate_list}')
        sql_output_max, sql_output_vote_percentage = self.self_consistency_vote(sql_output_candidate_list)

        resp = dict()
        resp['question'] = question
        resp['model'] = domain_name
        resp['fields'] = fields_list
        resp['priorSchemaLinking'] = prior_schema_links
        resp['dataDate'] = data_date

        resp['schemaLinkStr'] = schema_linking_output_max
        resp['schemaLinkingWeight'] = schema_linking_output_vote_percentage

        resp['sqlOutput'] = sql_output_max
        resp['sqlWeight'] = sql_output_vote_percentage

        logger.info("resp: {}".format(resp))

        return resp

    async def async_query2sql(self, question: str, filter_condition: Mapping[str, str],
                              model_name: str, fields_list: List[str],
                              data_date: str, prior_schema_links: Mapping[str, str], prior_exts: str, llm_config: dict):
        # Two-pass flow: first elicit schema links, then generate SQL from them.
        logger.info("question: {}".format(question))
        logger.info("model_name: {}".format(model_name))
        logger.info("fields_list: {}".format(fields_list))
        logger.info("data_date: {}".format(data_date))
        logger.info("prior_schema_links: {}".format(prior_schema_links))
        logger.info("prior_exts: {}".format(prior_exts))

        if prior_exts != '':
            question = question + ' 备注:' + prior_exts  # 备注 = "note/remark"
            logger.info("question_prior_exts: {}".format(question))

        fewshot_example_meta_list = self.get_examples_candidates(question, filter_condition, self.num_examples)
        schema_linking_prompt = self.generate_schema_linking_prompt(question, model_name, fields_list, prior_schema_links, fewshot_example_meta_list)
        logger.debug("schema_linking_prompt->{}".format(schema_linking_prompt))
        llm = get_llm(llm_config)
        schema_link_output = await llm._call_async(schema_linking_prompt)

        schema_link_str = schema_link_parse(schema_link_output)

        sql_prompt = self.generate_sql_prompt(question, model_name, schema_link_str, data_date, fewshot_example_meta_list)
        logger.debug("sql_prompt->{}".format(sql_prompt))
        sql_output = await llm._call_async(sql_prompt)

        resp = dict()
        resp['question'] = question
        resp['model'] = model_name
        resp['fields'] = fields_list
        resp['priorSchemaLinking'] = prior_schema_links
        resp['dataDate'] = data_date

        resp['schemaLinkingOutput'] = schema_link_output
        resp['schemaLinkStr'] = schema_link_str

        resp['sqlOutput'] = sql_output

        logger.info("resp: {}".format(resp))

        return resp

    async def async_query2sql_shortcut(self, question: str, filter_condition: Mapping[str, str],
                                       model_name: str, fields_list: List[str],
                                       data_date: str, prior_schema_links: Mapping[str, str], prior_exts: str,
                                       llm_config: dict):
        # One-pass flow: a single prompt yields the schema links and the SQL together.
        logger.info("question: {}".format(question))
        logger.info("model_name: {}".format(model_name))
        logger.info("fields_list: {}".format(fields_list))
        logger.info("data_date: {}".format(data_date))
        logger.info("prior_schema_links: {}".format(prior_schema_links))
        logger.info("prior_exts: {}".format(prior_exts))

        if prior_exts != '':
            question = question + ' 备注:' + prior_exts  # 备注 = "note/remark"
            logger.info("question_prior_exts: {}".format(question))

        fewshot_example_meta_list = self.get_examples_candidates(question, filter_condition, self.num_examples)
        schema_linking_sql_shortcut_prompt = self.generate_schema_linking_sql_prompt(question, model_name, data_date, fields_list, prior_schema_links, fewshot_example_meta_list)
        logger.debug("schema_linking_sql_shortcut_prompt->{}".format(schema_linking_sql_shortcut_prompt))
        llm = get_llm(llm_config)
        schema_linking_sql_shortcut_output = await llm._call_async(schema_linking_sql_shortcut_prompt)

        schema_linking_str = combo_schema_link_parse(schema_linking_sql_shortcut_output)
        sql_str = combo_sql_parse(schema_linking_sql_shortcut_output)

        resp = dict()
        resp['question'] = question
        resp['model'] = model_name
        resp['fields'] = fields_list
        resp['priorSchemaLinking'] = prior_schema_links
        resp['dataDate'] = data_date

        resp['schemaLinkingComboOutput'] = schema_linking_sql_shortcut_output
        resp['schemaLinkStr'] = schema_linking_str
        resp['sqlOutput'] = sql_str

        logger.info("resp: {}".format(resp))

        return resp
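
combo_schema_link_parse and combo_sql_parse are defined elsewhere in this module and are not shown in this diff; purely as an illustration, a hypothetical parser for a combined completion of the form "...Schema_links:[...]\nSQL:..." might look like:

    import re

    def parse_combo_output(output: str):
        # Hypothetical sketch only; the real logic lives in combo_schema_link_parse
        # and combo_sql_parse.
        links = re.search(r'Schema_links:\s*(\[.*?\])', output, re.S)
        sql = re.search(r'SQL:\s*(.*)', output, re.S)
        return (links.group(1) if links else None,
                sql.group(1).strip() if sql else None)

    print(parse_combo_output("Schema_links:[t.a, t.b]\nSQL:SELECT a, b FROM t"))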

class SqlModeEnum(Enum):
    # Supported SQL-generation strategies: one-pass vs two-pass prompting,
    # each with an optional self-consistency vote.
    VALUE5 = '1_pass_auto_cot'
    VALUE6 = '1_pass_auto_cot_self_consistency'
    VALUE7 = '2_pass_auto_cot'
    VALUE8 = '2_pass_auto_cot_self_consistency'
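
The membership check used by the wrapper below can be exercised in isolation (a trimmed copy of the enum for illustration):

    from enum import Enum

    class Mode(Enum):
        ONE_PASS = '1_pass_auto_cot'
        ONE_PASS_SC = '1_pass_auto_cot_self_consistency'

    assert '1_pass_auto_cot' in (m.value for m in Mode)
    assert 'bogus' not in (m.value for m in Mode)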

class Text2DSLAgentWrapper(object):
    def __init__(self, sql_agent_act: Text2DSLAgentAutoCoT):
        self.sql_agent_act = sql_agent_act

    async def async_query2sql(self, question: str, filter_condition: Mapping[str, str],
                              model_name: str, fields_list: List[str],
                              data_date: str, prior_schema_links: Mapping[str, str], prior_exts: str,
                              sql_generation_mode: str, llm_config: dict, terms_list: Optional[List[Dict]] = []):
        # Dispatch to the strategy selected by sql_generation_mode.
        if sql_generation_mode not in (sql_mode.value for sql_mode in SqlModeEnum):
            raise ValueError(f"sql_generation_mode: {sql_generation_mode} is not in SqlModeEnum")

        if sql_generation_mode == '1_pass_auto_cot':
            logger.info(f"sql wrapper: {sql_generation_mode}")
            resp = await self.sql_agent_act.async_query2sql_shortcut(question=question, filter_condition=filter_condition, model_name=model_name, fields_list=fields_list, current_date=data_date, prior_schema_links=prior_schema_links, prior_exts=prior_exts, llm_config=llm_config, terms_list=terms_list)
            return resp
        elif sql_generation_mode == '1_pass_auto_cot_self_consistency':
            logger.info(f"sql wrapper: {sql_generation_mode}")
            resp = await self.sql_agent_act.tasks_run_shortcut(question=question, filter_condition=filter_condition, model_name=model_name, fields_list=fields_list, current_date=data_date, prior_schema_links=prior_schema_links, prior_exts=prior_exts, llm_config=llm_config, terms_list=terms_list)
            return resp
        elif sql_generation_mode == '2_pass_auto_cot':
            logger.info(f"sql wrapper: {sql_generation_mode}")
            resp = await self.sql_agent_act.async_query2sql(question=question, filter_condition=filter_condition, model_name=model_name, fields_list=fields_list, current_date=data_date, prior_schema_links=prior_schema_links, prior_exts=prior_exts, llm_config=llm_config, terms_list=terms_list)
            return resp
        elif sql_generation_mode == '2_pass_auto_cot_self_consistency':
            logger.info(f"sql wrapper: {sql_generation_mode}")
            resp = await self.sql_agent_act.tasks_run(question=question, filter_condition=filter_condition, model_name=model_name, fields_list=fields_list, current_date=data_date, prior_schema_links=prior_schema_links, prior_exts=prior_exts, llm_config=llm_config, terms_list=terms_list)
            return resp
        else:
            raise ValueError(f'sql_generation_mode:{sql_generation_mode} is not in SqlModeEnum')

    def update_configs(self, sql_example_ids: List[str], sql_example_units: List[Mapping[str, str]],
                       num_examples: int, num_fewshots: int, num_self_consistency: int):
        self.sql_agent_act.reload_setting(sql_example_ids=sql_example_ids, sql_example_units=sql_example_units, num_examples=num_examples, num_fewshots=num_fewshots, num_self_consistency=num_self_consistency)

    def add_examples(self, sql_example_ids: List[str], sql_example_units: List[Mapping[str, str]]):
        self.sql_agent_act.add_examples(sql_example_ids=sql_example_ids, sql_example_units=sql_example_units)

    def update_examples(self, sql_example_ids: List[str], sql_example_units: List[Mapping[str, str]]):
        self.sql_agent_act.update_examples(sql_example_ids=sql_example_ids, sql_example_units=sql_example_units)

    def delete_examples(self, sql_example_ids: List[str]):
        self.sql_agent_act.delete_examples(sql_example_ids=sql_example_ids)

    def get_examples(self, sql_example_ids: List[str]):
        sql_agent_act_examples = self.sql_agent_act.get_examples(sql_example_ids=sql_example_ids)

        return sql_agent_act_examples

    def count_examples(self):
        sql_agent_examples_act_cnt = self.sql_agent_act.count_examples()

        return sql_agent_examples_act_cnt

@@ -1,33 +0,0 @@
# -*- coding:utf-8 -*-
import os
import sys
from typing import Any, List, Mapping, Optional, Union

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from fastapi import APIRouter, Depends, HTTPException

from services.plugin_call.run import plugin_selection_run


router = APIRouter()

@router.post("/plugin_selection/")
async def tool_selection(query_body: Mapping[str, Any]):
    # Validate the request body before delegating to the plugin selector.
    if "queryText" not in query_body:
        raise HTTPException(status_code=400, detail="queryText is not in query_body")
    else:
        query_text = query_body["queryText"]

    if "pluginConfigs" not in query_body:
        raise HTTPException(
            status_code=400, detail="pluginConfigs is not in query_body"
        )
    else:
        plugin_configs = query_body["pluginConfigs"]

    resp = plugin_selection_run(query_text=query_text, plugin_configs=plugin_configs)

    return resp
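
For illustration, a hypothetical client call against this endpoint; the host, port, and plugin config shape are assumptions, not values from this codebase:

    import requests

    resp = requests.post(
        "http://127.0.0.1:9092/plugin_selection/",   # assumed service address
        json={
            "queryText": "show visits by city",
            "pluginConfigs": [{"name": "web_page", "description": "..."}],  # hypothetical shape
        },
    )
    print(resp.json())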

@@ -1,71 +0,0 @@
# -*- coding:utf-8 -*-
import os
import sys
from typing import Any, List, Mapping, Optional, Union

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from fastapi import APIRouter, Depends, HTTPException

from services.query_retrieval.run import preset_query_retriever

router = APIRouter()

@router.post("/preset_query_retrival")
def preset_query_retrival(query_text_list: List[str], n_results: int = 5):
    parsed_retrieval_res_format = preset_query_retriever.retrieval_query_run(query_texts_list=query_text_list, filter_condition=None, n_results=n_results)

    return parsed_retrieval_res_format


@router.post("/preset_query_add")
def preset_query_add(preset_info_list: List[Mapping[str, str]]):
    preset_queries = []
    preset_query_ids = []

    for preset_info in preset_info_list:
        preset_queries.append(preset_info['preset_query'])
        preset_query_ids.append(preset_info['preset_query_id'])

    preset_query_retriever.add_queries(query_text_list=preset_queries, query_id_list=preset_query_ids, metadatas=None)

    return "success"

@router.post("/preset_query_update")
def preset_query_update(preset_info_list: List[Mapping[str, str]]):
    preset_queries = []
    preset_query_ids = []

    for preset_info in preset_info_list:
        preset_queries.append(preset_info['preset_query'])
        preset_query_ids.append(preset_info['preset_query_id'])

    preset_query_retriever.update_queries(query_text_list=preset_queries, query_id_list=preset_query_ids, metadatas=None)

    return "success"


@router.get("/preset_query_empty")
def preset_query_empty():
    preset_query_retriever.empty_query_collection()

    return "success"

@router.post("/preset_delete_by_ids")
def preset_delete_by_ids(preset_query_ids: List[str]):
    preset_query_retriever.delete_queries_by_ids(preset_query_ids)

    return "success"

@router.post("/preset_get_by_ids")
def preset_get_by_ids(preset_query_ids: List[str]):
    preset_queries = preset_query_retriever.get_query_by_ids(preset_query_ids)

    return preset_queries

@router.get("/preset_query_size")
def preset_query_size():
    size = preset_query_retriever.get_query_size()

    return size
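
A hypothetical end-to-end exercise of the add/retrieve pair (the service address is an assumption; FastAPI reads the list argument from the JSON body and n_results from the query string):

    import requests

    base = "http://127.0.0.1:9092"                   # assumed service address
    requests.post(f"{base}/preset_query_add", json=[
        {"preset_query": "daily visits by city", "preset_query_id": "pq-1"},
    ])
    hits = requests.post(f"{base}/preset_query_retrival",
                         json=["visits per city"], params={"n_results": 3})
    print(hits.json())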

@@ -1,169 +0,0 @@
# -*- coding:utf-8 -*-
import os
import sys
import ast
from typing import Any, Mapping

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from fastapi import APIRouter, HTTPException

from services.s2sql.run import text2sql_agent_router

router = APIRouter()


@router.post("/query2sql")
async def query2sql(query_body: Mapping[str, Any]):
    # Validate required fields one by one, then unpack the schema payload.
    if 'queryText' not in query_body:
        raise HTTPException(status_code=400, detail="queryText is not in query_body")
    else:
        query_text = query_body['queryText']

    if 'schema' not in query_body:
        raise HTTPException(status_code=400, detail="schema is not in query_body")
    else:
        schema = query_body['schema']

    if 'currentDate' not in query_body:
        raise HTTPException(status_code=400, detail="currentDate is not in query_body")
    else:
        current_date = query_body['currentDate']

    if 'linking' not in query_body:
        raise HTTPException(status_code=400, detail="linking is not in query_body")
    else:
        linking = query_body['linking']

    if 'priorExts' not in query_body:
        raise HTTPException(status_code=400, detail="priorExts is not in query_body")
    else:
        prior_exts = query_body['priorExts']

    if 'filterCondition' not in query_body:
        raise HTTPException(status_code=400, detail="filterCondition is not in query_body")
    else:
        filter_condition = query_body['filterCondition']

    if 'sqlGenType' not in query_body:
        raise HTTPException(status_code=400, detail="sqlGenType is not in query_body")
    else:
        sql_gen_type = query_body['sqlGenType']

    if 'llmConfig' in query_body:
        llm_config = ast.literal_eval(str(query_body['llmConfig']))
    else:
        llm_config = None

    dataset_name = schema['dataSetName']
    fields_list = schema['fieldNameList']
    prior_schema_links = {item['fieldValue']: item['fieldName'] for item in linking}
    terms_list = schema['terms']

    resp = await text2sql_agent_router.async_query2sql(question=query_text, filter_condition=filter_condition,
                                                       model_name=dataset_name, fields_list=fields_list,
                                                       data_date=current_date, prior_schema_links=prior_schema_links,
                                                       prior_exts=prior_exts, sql_generation_mode=sql_gen_type,
                                                       llm_config=llm_config, terms_list=terms_list)

    return resp
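
The request body this endpoint expects, sketched as a Python dict; all field values are illustrative only:

    payload = {
        "queryText": "total visits by city last week",
        "schema": {
            "dataSetName": "S2VisitsDemo",                     # illustrative
            "fieldNameList": ["city", "visits", "date"],
            "terms": [],
        },
        "currentDate": "2024-06-01",
        "linking": [{"fieldValue": "Beijing", "fieldName": "city"}],
        "priorExts": "",
        "filterCondition": {},
        "sqlGenType": "1_pass_auto_cot_self_consistency",
        # "llmConfig": {...}  # optional; parsed with ast.literal_eval when present
    }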


@router.post("/query2sql_setting_update")
def query2sql_setting_update(query_body: Mapping[str, Any]):
    if 'sqlExamplars' not in query_body:
        raise HTTPException(status_code=400, detail="sqlExamplars is not in query_body")
    else:
        sql_examplars = query_body['sqlExamplars']

    if 'sqlIds' not in query_body:
        raise HTTPException(status_code=400, detail="sqlIds is not in query_body")
    else:
        sql_ids = query_body['sqlIds']

    if 'exampleNums' not in query_body:
        raise HTTPException(status_code=400, detail="exampleNums is not in query_body")
    else:
        example_nums = query_body['exampleNums']

    if 'fewshotNums' not in query_body:
        raise HTTPException(status_code=400, detail="fewshotNums is not in query_body")
    else:
        fewshot_nums = query_body['fewshotNums']

    if 'selfConsistencyNums' not in query_body:
        raise HTTPException(status_code=400, detail="selfConsistencyNums is not in query_body")
    else:
        self_consistency_nums = query_body['selfConsistencyNums']

    text2sql_agent_router.update_configs(sql_example_ids=sql_ids, sql_example_units=sql_examplars,
                                         num_examples=example_nums, num_fewshots=fewshot_nums, num_self_consistency=self_consistency_nums)

    return "success"


@router.post("/query2sql_add_examples")
def query2sql_add_examples(query_body: Mapping[str, Any]):
    if 'sqlIds' not in query_body:
        raise HTTPException(status_code=400, detail="sqlIds is not in query_body")
    else:
        sql_ids = query_body['sqlIds']

    if 'sqlExamplars' not in query_body:
        raise HTTPException(status_code=400,
                            detail="sqlExamplars is not in query_body")
    else:
        sql_examplars = query_body['sqlExamplars']

    text2sql_agent_router.add_examples(sql_example_ids=sql_ids, sql_example_units=sql_examplars)

    return "success"


@router.post("/query2sql_update_examples")
def query2sql_update_examples(query_body: Mapping[str, Any]):
    if 'sqlIds' not in query_body:
        raise HTTPException(status_code=400, detail="sqlIds is not in query_body")
    else:
        sql_ids = query_body['sqlIds']

    if 'sqlExamplars' not in query_body:
        raise HTTPException(status_code=400,
                            detail="sqlExamplars is not in query_body")
    else:
        sql_examplars = query_body['sqlExamplars']

    text2sql_agent_router.update_examples(sql_example_ids=sql_ids, sql_example_units=sql_examplars)

    return "success"


@router.post("/query2sql_delete_examples")
def query2sql_delete_examples(query_body: Mapping[str, Any]):
    if 'sqlIds' not in query_body:
        raise HTTPException(status_code=400, detail="sqlIds is not in query_body")
    else:
        sql_ids = query_body['sqlIds']

    text2sql_agent_router.delete_examples(sql_example_ids=sql_ids)

    return "success"

@router.post("/query2sql_get_examples")
def query2sql_get_examples(query_body: Mapping[str, Any]):
    if 'sqlIds' not in query_body:
        raise HTTPException(status_code=400, detail="sqlIds is not in query_body")
    else:
        sql_ids = query_body['sqlIds']

    examples = text2sql_agent_router.get_examples(sql_example_ids=sql_ids)

    return examples

@router.get("/query2sql_count_examples")
def query2sql_count_examples():
    examples_cnt = text2sql_agent_router.count_examples()

    return examples_cnt

@@ -1,156 +0,0 @@
# -*- coding:utf-8 -*-
import os
import sys
from typing import Any, List, Mapping, Optional, Union

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from fastapi import APIRouter, Depends, HTTPException

from services.query_retrieval.run import collection_manager
from services.query_retrieval.retriever import ChromaCollectionRetriever

router = APIRouter()

@router.get("/list_collections")
def list_collections():
    collections = collection_manager.list_collections()

    return collections

@router.get("/create_collection")
def create_collection(collection_name: str):
    collection_manager.create_collection(collection_name)

    return "success"

@router.get("/delete_collection")
def delete_collection(collection_name: str):
    collection_manager.delete_collection(collection_name)

    return "success"

@router.get("/get_collection")
def get_collection(collection_name: str):
    collection = collection_manager.get_collection(collection_name)

    return collection

@router.get("/get_or_create_collection")
def get_or_create_collection(collection_name: str):
    collection = collection_manager.get_or_create_collection(collection_name)

    return collection

@router.post("/add_query")
def query_add(collection_name: str, query_info_list: List[Mapping[str, Any]]):
    queries = []
    query_ids = []
    metadatas = []
    embeddings = []

    for query_info in query_info_list:
        queries.append(query_info['query'])
        query_ids.append(query_info['queryId'])
        metadatas.append(query_info['metadata'])
        embeddings.append(query_info['queryEmbedding'])

    # Callers must supply either raw query texts or precomputed embeddings,
    # never both and never neither.
    if None in embeddings:
        embeddings = None
    if None in queries:
        queries = None

    if embeddings is None and queries is None:
        raise HTTPException(status_code=400, detail="query and queryEmbedding are None")
    if embeddings is not None and queries is not None:
        raise HTTPException(status_code=400, detail="query and queryEmbedding are not None")

    query_collection = collection_manager.get_or_create_collection(collection_name=collection_name)
    query_retriever = ChromaCollectionRetriever(collection=query_collection)
    query_retriever.add_queries(query_text_list=queries, query_id_list=query_ids, metadatas=metadatas, embeddings=embeddings)

    return "success"

@router.post("/update_query")
def update_query(collection_name: str, query_info_list: List[Mapping[str, Any]]):
    queries = []
    query_ids = []
    metadatas = []
    embeddings = []

    for query_info in query_info_list:
        queries.append(query_info['query'])
        query_ids.append(query_info['queryId'])
        metadatas.append(query_info['metadata'])
        embeddings.append(query_info['queryEmbedding'])

    if None in embeddings:
        embeddings = None
    if None in queries:
        queries = None

    if embeddings is None and queries is None:
        raise HTTPException(status_code=400, detail="query and queryEmbedding are None")
    if embeddings is not None and queries is not None:
        raise HTTPException(status_code=400, detail="query and queryEmbedding are not None")

    query_collection = collection_manager.get_or_create_collection(collection_name=collection_name)
    query_retriever = ChromaCollectionRetriever(collection=query_collection)
    query_retriever.update_queries(query_text_list=queries, query_id_list=query_ids, metadatas=metadatas, embeddings=embeddings)

    return "success"

@router.get("/empty_query")
def empty_query(collection_name: str):
    query_collection = collection_manager.get_or_create_collection(collection_name=collection_name)
    query_retriever = ChromaCollectionRetriever(collection=query_collection)
    query_retriever.empty_query_collection()

    return "success"


@router.post("/delete_query_by_ids")
def delete_query_by_ids(collection_name: str, query_ids: List[str]):
    query_collection = collection_manager.get_or_create_collection(collection_name=collection_name)
    query_retriever = ChromaCollectionRetriever(collection=query_collection)
    query_retriever.delete_queries_by_ids(query_ids=query_ids)

    return "success"

@router.post("/get_query_by_ids")
def get_query_by_ids(collection_name: str, query_ids: List[str]):
    query_collection = collection_manager.get_or_create_collection(collection_name=collection_name)
    query_retriever = ChromaCollectionRetriever(collection=query_collection)
    queries = query_retriever.get_query_by_ids(query_ids=query_ids)

    return queries

@router.get("/query_size")
def query_size(collection_name: str):
    query_collection = collection_manager.get_or_create_collection(collection_name=collection_name)
    query_retriever = ChromaCollectionRetriever(collection=query_collection)
    size = query_retriever.get_query_size()

    return size

@router.post("/retrieve_query")
def retrieve_query(collection_name: str, query_info: Mapping[str, Any], n_results: int = 10):
    query_collection = collection_manager.get_or_create_collection(collection_name=collection_name)
    query_retriever = ChromaCollectionRetriever(collection=query_collection)

    query_texts_list = query_info['queryTextsList']
    query_embeddings = query_info['queryEmbeddings']
    filter_condition = query_info['filterCondition']

    if query_texts_list is None and query_embeddings is None:
        raise HTTPException(status_code=400, detail="query and queryEmbedding are None")
    if query_texts_list is not None and query_embeddings is not None:
        raise HTTPException(status_code=400, detail="query and queryEmbedding are not None")

    parsed_retrieval_res_format = query_retriever.retrieval_query_run(query_texts_list=query_texts_list,
                                                                      query_embeddings=query_embeddings,
                                                                      filter_condition=filter_condition,
                                                                      n_results=n_results)

    return parsed_retrieval_res_format

@@ -1,80 +0,0 @@
# -*- coding:utf-8 -*-
import os
import sys
from typing import Any, List, Mapping, Optional, Union

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from fastapi import APIRouter, Depends, HTTPException

from services.query_retrieval.run import solved_query_retriever

router = APIRouter()

@router.post("/solved_query_retrival")
def solved_query_retrival(query_info: Mapping[str, Any], n_results: int = 5):
    query_texts_list = query_info['queryTextsList']
    filter_condition = query_info['filterCondition']

    parsed_retrieval_res_format = solved_query_retriever.retrieval_query_run(query_texts_list=query_texts_list,
                                                                             filter_condition=filter_condition,
                                                                             n_results=n_results)

    return parsed_retrieval_res_format


@router.post("/solved_query_add")
def add_solved_queries(solved_query_info_list: List[Mapping[str, Any]]):
    queries = []
    query_ids = []
    metadatas = []

    for solved_query_info in solved_query_info_list:
        queries.append(solved_query_info['query'])
        query_ids.append(solved_query_info['query_id'])
        metadatas.append(solved_query_info['metadata'])

    solved_query_retriever.add_queries(query_text_list=queries, query_id_list=query_ids, metadatas=metadatas)

    return "success"

@router.post("/solved_query_update")
def solved_query_update(solved_query_info_list: List[Mapping[str, Any]]):
    queries = []
    query_ids = []
    metadatas = []

    for solved_query_info in solved_query_info_list:
        queries.append(solved_query_info['query'])
        query_ids.append(solved_query_info['query_id'])
        metadatas.append(solved_query_info['metadata'])

    solved_query_retriever.update_queries(query_text_list=queries, query_id_list=query_ids, metadatas=metadatas)

    return "success"


@router.get("/solved_query_empty")
def solved_query_empty():
    solved_query_retriever.empty_query_collection()

    return "success"

@router.post("/solved_query_delete_by_ids")
def solved_query_delete_by_ids(query_ids: List[str]):
    solved_query_retriever.delete_queries_by_ids(query_ids=query_ids)

    return "success"

@router.post("/solved_query_get_by_ids")
def solved_query_get_by_ids(query_ids: List[str]):
    queries = solved_query_retriever.get_query_by_ids(query_ids=query_ids)

    return queries

@router.get("/solved_query_size")
def solved_query_size():
    size = solved_query_retriever.get_query_size()

    return size

@@ -1,31 +0,0 @@
# -*- coding:utf-8 -*-
import os
import sys

import uvicorn

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from fastapi import FastAPI

from config.config_parse import LLMPARSER_HOST, LLMPARSER_PORT

from services_router import (query2sql_service, preset_query_service,
                             solved_query_service, retriever_service)


app = FastAPI()

@app.get("/health")
def read_health():
    return {"status": "Healthy"}

app.include_router(preset_query_service.router)
app.include_router(solved_query_service.router)
app.include_router(query2sql_service.router)
#app.include_router(plugin_call_service.router)
app.include_router(retriever_service.router)

if __name__ == "__main__":
    uvicorn.run(app, host=LLMPARSER_HOST, port=LLMPARSER_PORT)
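
Once the app is running, a quick liveness probe; 127.0.0.1:9092 is an assumption for whatever LLMPARSER_HOST and LLMPARSER_PORT resolve to:

    import requests

    r = requests.get("http://127.0.0.1:9092/health", timeout=5)
    assert r.json() == {"status": "Healthy"}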

@@ -1,154 +0,0 @@
# -*- coding:utf-8 -*-
from typing import Any, List, Mapping, Optional, Union

import chromadb
from chromadb.api import Collection, Documents, Embeddings
from chromadb.config import Settings

import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from instances.logging_instance import logger


def empty_chroma_collection_2(collection: Collection):
    # Empty a collection by dropping and recreating it with the same name,
    # metadata, and embedding function (relies on Chroma's private attributes).
    collection_name = collection.name
    client = collection._client
    metadata = collection.metadata
    embedding_function = collection._embedding_function

    client.delete_collection(collection_name)

    new_collection = client.get_or_create_collection(name=collection_name,
                                                     metadata=metadata,
                                                     embedding_function=embedding_function)

    size_of_new_collection = new_collection.count()

    logger.info(f'Collection {collection_name} emptied. Size of new collection: {size_of_new_collection}')

    return new_collection


def empty_chroma_collection(collection: Collection) -> None:
    collection.delete()


def add_chroma_collection(collection: Collection,
                          queries: List[str],
                          query_ids: List[str],
                          metadatas: List[Mapping[str, str]] = None,
                          embeddings: Embeddings = None
                          ) -> None:

    collection.add(documents=queries,
                   ids=query_ids,
                   metadatas=metadatas,
                   embeddings=embeddings)


def update_chroma_collection(collection: Collection,
                             queries: List[str],
                             query_ids: List[str],
                             metadatas: List[Mapping[str, str]] = None,
                             embeddings: Embeddings = None
                             ) -> None:

    collection.update(documents=queries,
                      ids=query_ids,
                      metadatas=metadatas,
                      embeddings=embeddings)


def query_chroma_collection(collection: Collection, query_texts: List[str] = None, query_embeddings: Embeddings = None,
                            filter_condition: Mapping[str, str] = None, n_results: int = 10):
    # Chroma's `where` clause needs an explicit $and wrapper once more than one
    # metadata field is filtered; a single condition can be passed through as-is.
    outer_opt = '$and'
    inner_opt = '$eq'

    if filter_condition is not None and len(filter_condition) > 0:
        if len(filter_condition) == 1:
            outer_filter = filter_condition
        else:
            inner_filter = [{_k: {inner_opt: _v}} for _k, _v in filter_condition.items()]
            outer_filter = {outer_opt: inner_filter}
    else:
        outer_filter = None

    logger.info('outer_filter: {}'.format(outer_filter))

    return collection.query(query_texts=query_texts, query_embeddings=query_embeddings,
                            n_results=n_results, where=outer_filter)
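
The where-filter construction on its own, for a multi-field condition (field names and values are illustrative):

    filter_condition = {"model": "S2VisitsDemo", "type": "metric"}
    inner = [{k: {"$eq": v}} for k, v in filter_condition.items()]
    where = {"$and": inner}
    # {"$and": [{"model": {"$eq": "S2VisitsDemo"}}, {"type": {"$eq": "metric"}}]}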

def parse_retrieval_chroma_collection_query(res: Mapping[str, Any]):
    # Chroma returns parallel lists per input query; regroup them into one
    # list of hit dicts per query.
    parsed_res = [[] for _ in range(0, len(res['ids']))]

    retrieval_ids = res['ids']
    retrieval_distances = res['distances']
    retrieval_sentences = res['documents']
    retrieval_metadatas = res['metadatas']

    for query_idx in range(0, len(retrieval_ids)):
        id_ls = retrieval_ids[query_idx]
        distance_ls = retrieval_distances[query_idx]
        sentence_ls = retrieval_sentences[query_idx]
        metadata_ls = retrieval_metadatas[query_idx]

        for idx in range(0, len(id_ls)):
            id = id_ls[idx]
            distance = distance_ls[idx]
            sentence = sentence_ls[idx]
            metadata = metadata_ls[idx]

            parsed_res[query_idx].append({
                'id': id,
                'distance': distance,
                'query': sentence,
                'metadata': metadata
            })

    return parsed_res

def chroma_collection_query_retrieval_format(query_list: List[str], query_embeddings: Embeddings, retrieval_list: List[Mapping[str, Any]]):
    # Pair each input query (text or embedding) with its retrieval hits.
    res = []

    if query_list is not None and query_embeddings is not None:
        raise Exception("query_list and query_embeddings are not None")
    if query_list is None and query_embeddings is None:
        raise Exception("query_list and query_embeddings are None")

    if query_list is not None:
        for query_idx in range(0, len(query_list)):
            query = query_list[query_idx]
            retrieval = retrieval_list[query_idx]

            res.append({
                'query': query,
                'retrieval': retrieval
            })
    else:
        for query_idx in range(0, len(query_embeddings)):
            query_embedding = query_embeddings[query_idx]
            retrieval = retrieval_list[query_idx]

            res.append({
                'query_embedding': query_embedding,
                'retrieval': retrieval
            })

    return res


def delete_chroma_collection_by_ids(collection: Collection, query_ids: List[str]) -> None:
    collection.delete(ids=query_ids)

def get_chroma_collection_by_ids(collection: Collection, query_ids: List[str]):
    res = collection.get(ids=query_ids)

    return res

def get_chroma_collection_size(collection: Collection) -> int:
    return collection.count()

@@ -1,23 +0,0 @@
# -*- coding:utf-8 -*-
from typing import List

from chromadb.api.types import Documents, EmbeddingFunction, Embeddings
from langchain.embeddings import HuggingFaceEmbeddings

from config.config_parse import HF_TEXT2VEC_MODEL_NAME

hg_embedding = HuggingFaceEmbeddings(model_name=HF_TEXT2VEC_MODEL_NAME)


class Text2VecEmbeddingFunction(EmbeddingFunction):
    # Adapter so a LangChain HuggingFace embedder can serve as a Chroma
    # embedding function.
    def __call__(self, texts: Documents) -> Embeddings:
        embeddings = hg_embedding.embed_documents(texts)

        return embeddings


def get_embeddings(documents: List[str]) -> List[List[float]]:
    embeddings = hg_embedding.embed_documents(documents)

    return embeddings
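
A sketch of how this embedding function plugs into a Chroma collection; the client setup and collection name are assumptions:

    import chromadb

    client = chromadb.Client()
    collection = client.get_or_create_collection(
        name="preset_query_collection",              # hypothetical name
        embedding_function=Text2VecEmbeddingFunction(),
    )
    collection.add(documents=["daily visits by city"], ids=["q1"])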

@@ -4,11 +4,11 @@ langchain4j:
       # Replace with your LLM configs
       # Note: The default API key `demo` is provided by langchain4j community
       # which limits 1000 tokens per request.
-      base-url: ${OPENAI_API_BASE:https://api.openai.com/v1}
-      api-key: ${OPENAI_API_KEY:demo}
-      model-name: ${OPENAI_MODEL_NAME:gpt-3.5-turbo}
-      temperature: ${OPENAI_TEMPERATURE:0.0}
-      timeout: ${OPENAI_TIMEOUT:PT60S}
+      base-url: https://api.openai.com/v1
+      api-key: demo
+      model-name: gpt-3.5-turbo
+      temperature: 0.0
+      timeout: PT60S
       in-memory:
         embedding-model:
           model-name: bge-small-zh
@@ -1,9 +1,5 @@
 s2:
-  pyllm:
-    url: http://127.0.0.1:9092
-
   parser:
-    url: ${s2.pyllm.url}
     strategy: ONE_PASS_SELF_CONSISTENCY
     exemplar-recall:
       number: 10
@@ -18,13 +14,6 @@ s2:
   additional:
     information: true
     date: true
-  functionCall:
-    url: ${s2.pyllm.url}
-
-  embedding:
-    url: ${s2.pyllm.url}
-    persistent:
-      path: /tmp
-
   demo:
     names: S2VisitsDemo,S2ArtistDemo,SmallTalkDemo
@@ -4,11 +4,11 @@ langchain4j:
       # Replace with your LLM configs
       # Note: The default API key `demo` is provided by langchain4j community
       # which limits 1000 tokens per request.
-      base-url: ${OPENAI_API_BASE:https://api.openai.com/v1}
-      api-key: ${OPENAI_API_KEY:demo}
-      model-name: ${OPENAI_MODEL_NAME:gpt-3.5-turbo}
-      temperature: ${OPENAI_TEMPERATURE:0.0}
-      timeout: ${OPENAI_TIMEOUT:PT60S}
+      base-url: https://api.openai.com/v1
+      api-key: demo
+      model-name: gpt-3.5-turbo
+      temperature: 0.0
+      timeout: PT60S
       in-memory:
         embedding-model:
           model-name: bge-small-zh
@@ -1,9 +1,5 @@
 s2:
-  pyllm:
-    url: http://127.0.0.1:9092
-
   parser:
-    url: ${s2.pyllm.url}
     strategy: ONE_PASS_SELF_CONSISTENCY
     exemplar-recall:
       number: 10
@@ -18,13 +14,7 @@ s2:
   additional:
     information: true
     date: true
-  functionCall:
-    url: ${s2.pyllm.url}
-
-  embedding:
-    url: ${s2.pyllm.url}
-    persistent:
-      path: /tmp
-
   demo:
     names: S2VisitsDemo,S2ArtistDemo
pom.xml
@@ -64,12 +64,6 @@
         <calcite.avatica.version>1.23.0</calcite.avatica.version>
         <xk.time.version>3.2.4</xk.time.version>
         <mockito-inline.version>4.5.1</mockito-inline.version>
-
-        <!-- Do not bump spotless plugin version since 2.30.0 is the latest version that supports Java 8-->
-        <maven.plugin.spotless.version>2.30.0</maven.plugin.spotless.version>
-        <spotless.python.includes></spotless.python.includes>
-        <!-- Do not bump black version as decided by spotless maven plugin-->
-        <spotless.python.black.version>22.3.0</spotless.python.black.version>
         <easyexcel.version>2.2.6</easyexcel.version>
         <poi.version>3.17</poi.version>
         <langchain4j.version>0.31.0</langchain4j.version>
@@ -223,15 +217,6 @@
         </dependencies>
     </dependencyManagement>

-    <profiles>
-        <profile>
-            <id>spotless-python</id>
-            <properties>
-                <spotless.python.includes>src/**/*.py</spotless.python.includes>
-            </properties>
-        </profile>
-    </profiles>
-
     <build>
         <plugins>
             <plugin>
@@ -281,10 +266,6 @@
                 <groupId>org.apache.maven.plugins</groupId>
                 <artifactId>maven-checkstyle-plugin</artifactId>
             </plugin>
-            <plugin>
-                <groupId>com.diffplug.spotless</groupId>
-                <artifactId>spotless-maven-plugin</artifactId>
-            </plugin>
         </plugins>
         <pluginManagement>
             <plugins>
@@ -323,31 +304,6 @@
                     </execution>
                 </executions>
             </plugin>
-            <plugin>
-                <groupId>com.diffplug.spotless</groupId>
-                <artifactId>spotless-maven-plugin</artifactId>
-                <version>${maven.plugin.spotless.version}</version>
-                <configuration>
-                    <upToDateChecking>
-                        <enabled>true</enabled>
-                    </upToDateChecking>
-                    <python>
-                        <includes>
-                            <include>${spotless.python.includes}</include>
-                        </includes>
-                        <black>
-                            <version>${spotless.python.black.version}</version>
-                        </black>
-                    </python>
-                </configuration>
-                <executions>
-                    <execution>
-                        <goals>
-                            <goal>check</goal>
-                        </goals>
-                    </execution>
-                </executions>
-            </plugin>
         </plugins>
     </pluginManagement>
 </build>