From 61ecb6274d509359c2e289ae6550f441f6008bcb Mon Sep 17 00:00:00 2001 From: zhaodongsheng Date: Wed, 26 Jun 2024 14:34:03 +0800 Subject: [PATCH] (feat) add benchmark test (#1227) --- benchmark/benchmark.md | 83 +++++++++++++++++++++++++++++ benchmark/benchmark.py | 104 +++++++++++++++++++++++++++++++++++++ benchmark/data/caiwu.csv | 3 ++ benchmark/data/renli.csv | 8 +++ benchmark/data/shuce.csv | 4 ++ benchmark/requirements.txt | 5 ++ 6 files changed, 207 insertions(+) create mode 100644 benchmark/benchmark.md create mode 100644 benchmark/benchmark.py create mode 100644 benchmark/data/caiwu.csv create mode 100644 benchmark/data/renli.csv create mode 100644 benchmark/data/shuce.csv create mode 100644 benchmark/requirements.txt diff --git a/benchmark/benchmark.md b/benchmark/benchmark.md new file mode 100644 index 000000000..a14c355c5 --- /dev/null +++ b/benchmark/benchmark.md @@ -0,0 +1,83 @@ +## 使用场景 +产品上线阶段批量测试问答对话的问题,统计测试结果。 +注意与evaluation模块的区别:evaluation是基于构建的数据集对多个模型做横向评估,benchmark是在选定模型下,批量自动化测试业务问题。 +## 功能说明 +批量自动化执行问答对话测试,目前支持单轮问答测试。 + +## 使用说明 +注意:建议在开发测试环境执行;如果需要在生产环境测试,请避开用户使用高峰期。 +1. 准备测试问题 + +将问题写入`test_data.csv`文件,格式如下: +```csv +question +各BG期间在职、入职、离职人员的平均薪资是多少?(注意:薪资不包括香港视源、广视以及并购控股子公司青松、仙视的数据。) +各BG期间入职且仍在职的人数有多少? +各BG当月的净增长人数及其增长率是多少? +``` +将文件放入`benchmark/data`目录下。 + +2. 执行测试 +```bash +python benchmark.py -u http://localhost:3100 -a 6 -c 141 -f data/renli.csv -p zds +``` +参数说明: +- -a: 助手(agent)的id +- -c: 会话id(chat_id) +- -f: 测试问题文件 +- -u: 问答系统的url(-p 为用户名,用于生成登录token) +如果执行时报错提示未安装相关python包,可以执行`pip install -r requirements.txt`安装相关包。 + +3. 
查看测试结果 +当前,只能在数据库中查看测试结果。 +```sql +select question_id,chat_id,create_time,query_text, + JSON_EXTRACT(parse_info,'$.sqlInfo.s2SQL') as s2sql, + JSON_EXTRACT(parse_info,'$.sqlInfo.correctS2SQL') as correctS2SQL, + JSON_EXTRACT(parse_info,'$.sqlInfo.querySQL') as querySQL, + '请标记正确的SQL' as correctSQL, + '请标记生成SQL是否正确' as isOk, + '请分类不正确的原因' as reason +from s2_chat_parse scp where user_name = 'zhaodongsheng' and chat_id = '141'; + +select question_id,chat_id,create_time,query_text, + JSON_EXTRACT(query_result,'$.querySql') as querySql, + JSON_EXTRACT(query_result,'$.queryResults') as queryResults +from s2_chat_query where user_name = 'zhaodongsheng' and chat_id = '141' and query_state = 1; + +``` +4. 查看帮助 +```bash +python benchmark.py --help +usage: benchmark.py [-h] -u URL -a AGENTID -c CHATID -f FILEPATH -p USERNAME + +optional arguments: + -h, --help show this help message and exit + -u URL, --url URL url:问答系统url,例如:https://chatdata-dev.test.com + -a AGENTID, --agentId AGENTID + agentId:助手ID + -c CHATID, --chatId CHATID + chatId:会话ID,需要通过浏览器开发者模式获取 + -f FILEPATH, --filePath FILEPATH + filePath:问题文件路径, csv格式. 请提前上传到benchmark/data目录下 + -p USERNAME, --userName USERNAME + userName:用户名,用户获取登录token +``` + +## 演示效果 +```bash +python benchmark.py -u https://chatdata-dev.test.com -a 3 -c 35 -f data/shuce.csv -p zds +批量测试配置信息[url: https://chatdata-dev.test.com agentId: 3 chatId: 35 filePath: data/shuce.csv userName: zds ] +请确认输入的压力测试信息是否正确: +1. Yes +2. No +1 +start to ask question: 各BG期间在职、入职、离职人员的平均薪资是多少?(注意:薪资不包括香港视源、广视以及并购控股子公司青松、仙视的数据。) +start to ask question: 各BG期间入职且仍在职的人数有多少? +start to ask question: 各BG当月的净增长人数及其增长率是多少? 
+``` + +## TODO +- [x] 问答对话测试 +- [ ] 多轮对话测试 +- [ ] 问答对话测试结果展示 \ No newline at end of file diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py new file mode 100644 index 000000000..538c5df49 --- /dev/null +++ b/benchmark/benchmark.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +# ----------------------------------------------------------------------------------- +''' +@filename : benchmark.py +@time : 2024/06/20 +@author : zhaodongsheng +@Version : 1.0 +@description : Batch question-and-answer test +''' +# ----------------------------------------------------------------------------------- +import pandas as pd +import json +import requests +import time +import jwt +import traceback + +class BatchTest: + def __init__(self, url, agentId, chatId, userName): + self.base_url = url + '/api/chat/query/' + self.agentId = agentId + self.auth_token = self.__get_authorization(userName) + self.chatId = chatId + + def parse(self, query_text): + url = self.base_url + 'parse' + data = { + 'queryText': query_text, + 'agentId': self.agentId, + 'chatId': self.chatId, + } + headers = { + 'Authorization': 'Bearer ' + self.auth_token, + 'Content-Type': 'application/json', + } + + response = requests.post(url, headers=headers, data=json.dumps(data)) + return response.json() + + def execute(self, query_text, queryId): + url = self.base_url + 'execute' + data = { + 'queryText': query_text, + 'parseId': 1, # NOTE(review): parseId is hard-coded to 1; confirm this is intended + 'chatId': self.chatId, + 'queryId': queryId, + } + headers = { + 'Authorization': 'Bearer ' + self.auth_token, + 'Content-Type': 'application/json', + } + + response = requests.post(url, headers=headers, data=json.dumps(data)) + return response.json() + + def read_question_from_csv(self, filePath): + df = pd.read_csv(filePath) + return df + + def __get_authorization(self, userName): + # The secret must match com.tencent.supersonic.auth.api.authentication.config.AuthenticationConfig.tokenAppSecret. NOTE(review): the signing secret is hard-coded in source. + secret = 
"WIaO9YRRVt+7QtpPvyWsARFngnEcbaKBk783uGFwMrbJBaochsqCH62L4Kijcb0sZCYoSsiKGV/zPml5MnZ3uQ==" + exp = time.time() + 100000000 + token= jwt.encode({"token_userName": userName,"exp": exp}, secret, algorithm="HS512") + return token + + +def benchmark(url:str, agentId:str, chatId:str, filePath:str, userName:str): + batch_test = BatchTest(url, agentId, chatId, userName) + df = batch_test.read_question_from_csv(filePath) + for index, row in df.iterrows(): + question = row['question'] + print('start to ask question:', question) + # Catch any exception so that one failing question does not abort the run + try: + parse_resp = batch_test.parse(question) + batch_test.execute(question, parse_resp['data']['queryId']) + except Exception as e: + print('error:', e) + traceback.print_exc() + continue + time.sleep(1) + +if __name__ == '__main__': + + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('-u', '--url', type=str, required=True, help='url:问答系统url,例如:https://chatdata-dev.test.com') + parser.add_argument('-a', '--agentId', type=str, required=True, help='agentId:助手ID') + parser.add_argument('-c', '--chatId', type=str, required=True, help='chatId:会话ID,需要通过浏览器开发者模式获取') + parser.add_argument('-f', '--filePath', type=str, required=True, help='filePath:问题文件路径, csv格式. 请提前上传到benchmark/data目录下') + parser.add_argument('-p', '--userName', type=str, required=True, help='userName:用户名,用户获取登录token') + args = parser.parse_args() + + print('批量测试配置信息[url:', args.url,'agentId:', args.agentId, 'chatId:', args.chatId, 'filePath:', args.filePath, 'userName:', args.userName, ']') + print('请确认输入的压力测试信息是否正确:') + print('1. Yes') + print('2. 
No') + confirm = input() + if confirm == '1' or confirm == 'Yes' or confirm == 'yes' or confirm == 'YES': + benchmark(args.url, args.agentId, args.chatId, args.filePath, args.userName) + else: + print('请重新输入压力测试配置信息: url, agentId, chatId, filePath, userName') \ No newline at end of file diff --git a/benchmark/data/caiwu.csv b/benchmark/data/caiwu.csv new file mode 100644 index 000000000..6209bbacd --- /dev/null +++ b/benchmark/data/caiwu.csv @@ -0,0 +1,3 @@ +question +每个业务组(BG)的员工人数是多少? +每个业务组的损益情况如何? \ No newline at end of file diff --git a/benchmark/data/renli.csv b/benchmark/data/renli.csv new file mode 100644 index 000000000..3faa5e53a --- /dev/null +++ b/benchmark/data/renli.csv @@ -0,0 +1,8 @@ +question +在职人员的男女比例是多少? +期间入职且离职的人数及其占比如何? +期间新入职社招人员的平均年龄是多少? +期间入职且在职的人数有多少? +期间在职人员的平均年龄是多少? +当月的净增长人数及其增长率是多少? +期间新入职社招人员的年龄分布情况如何? diff --git a/benchmark/data/shuce.csv b/benchmark/data/shuce.csv new file mode 100644 index 000000000..95bb75ec3 --- /dev/null +++ b/benchmark/data/shuce.csv @@ -0,0 +1,4 @@ +question +在广东省内,哪一个学校的累计集备数最多,请返回该学校的学校名称 +在广东省内,哪一个学校的累计集体备课数最多,请返回该学校的学校名称 + diff --git a/benchmark/requirements.txt b/benchmark/requirements.txt new file mode 100644 index 000000000..d6f5648d8 --- /dev/null +++ b/benchmark/requirements.txt @@ -0,0 +1,5 @@ +pandas==2.0.3 +PyJWT==2.8.0 +requests==2.28.2 + +