(feat) add benchmark test (#1227)

2025-12-10 11:07:06 +00:00 · 2024-06-26 14:34:03 +08:00
parent dcbb4dec03
commit 61ecb6274d
6 changed files with 207 additions and 0 deletions
--- a/benchmark/benchmark.md
+++ b/benchmark/benchmark.md
@@ -0,0 +1,83 @@
 ## 使用场景
 产品上线阶段批量测试问答对话的问题，统计测试结果。
 注意与evaluation模块的区别：evaluation是基于构建的数据集对多个模型进行横向评估；benchmark是在选定模型下，对业务问题进行批量自动化测试。
 ## 功能说明
 批量自动化问答对话测试，目前支持单轮问答测试。
 ## 使用说明
 注意：建议在开发测试环境中执行；如果需要在生产环境中测试，请避开用户使用高峰期。
 1. 准备测试问题
 将问题写入`test_data.csv`文件，格式如下：
 ```csv
 question
 各BG期间在职、入职、离职人员的平均薪资是多少？（注意：薪资不包括香港视源、广视以及并购控股子公司青松、仙视的数据。）
 各BG期间入职且仍在职的人数有多少？
 各BG当月的净增长人数及其增长率是多少？
 ```
 将文件放入`benchmark/data`目录下。
 2. 执行测试
 ```bash
 python benchmark.py -u http://localhost:3100 -a 6 -c 141 -f data/renli.csv -p zds
 ```
 参数说明：
 - -a: 助手（agent）的id
 - -c: chat_id
 - -f: 测试问题文件
 - -u: 问答系统的url
 - -p: 用户名，用于生成登录token
 如果执行报错，没有安装相关python包，可以执行`pip install -r requirements.txt`安装相关包。
 3. 查看测试结果
 当前，只能在数据库中查看测试结果。
 ```sql
 select question_id,chat_id,create_time,query_text,
       JSON_EXTRACT(parse_info,'$.sqlInfo.s2SQL') as s2sql,
       JSON_EXTRACT(parse_info,'$.sqlInfo.correctS2SQL') as correctS2SQL,
       JSON_EXTRACT(parse_info,'$.sqlInfo.querySQL') as querySQL,
       '请标记正确的SQL' as correctSQL,
       '请标记生成SQL是否正确' as isOk,
       '请分类不正确的原因' as reason
 from s2_chat_parse scp where user_name = 'zds' and chat_id = '141';
 select question_id,chat_id,create_time,query_text,
       JSON_EXTRACT(query_result,'$.querySql') as querySql,
       JSON_EXTRACT(query_result,'$.queryResults') as queryResults
 from s2_chat_query where user_name = 'zds' and chat_id = '141' and query_state = 1;
 ```
 4. 查看帮助
 ```bash
 python benchmark.py --help
 usage: benchmark.py [-h] -u URL -a AGENTID -c CHATID -f FILEPATH -p USERNAME
 optional arguments:
  -h, --help            show this help message and exit
  -u URL, --url URL     url:问答系统url,例如：https://chatdata-dev.test.com
  -a AGENTID, --agentId AGENTID
                        agentId：助手ID
  -c CHATID, --chatId CHATID
                        chatId:会话ID,需要通过浏览器开发者模式获取
  -f FILEPATH, --filePath FILEPATH
                        filePath：问题文件路径, csv格式. 请提前上传到benchmark/data目录下
  -p USERNAME, --userName USERNAME
                        userName：用户名，用户获取登录token
 ```
 ## 演示效果
 ```bash
 python benchmark.py -u https://chatdata-dev.test.com -a 3 -c 35 -f data/shuce.csv -p zds
 批量测试配置信息[url: https://chatdata-dev.test.com agentId: 3 chatId: 35 filePath: data/shuce.csv userName: zds ]
 请确认输入的压力测试信息是否正确:
 1. Yes
 2. No
 1
 start to ask question: 各BG期间在职、入职、离职人员的平均薪资是多少？（注意：薪资不包括香港视源、广视以及并购控股子公司青松、仙视的数据。）
 start to ask question: 各BG期间入职且仍在职的人数有多少？
 start to ask question: 各BG当月的净增长人数及其增长率是多少？
 ```
 ## TODO
 - [x] 问答对话测试
 - [ ] 多轮对话测试
 - [ ] 问答对话测试结果展示
--- a/benchmark/benchmark.py
+++ b/benchmark/benchmark.py
@@ -0,0 +1,104 @@
 #!/usr/bin/env python
 # -*- encoding: utf-8 -*-
 # -----------------------------------------------------------------------------------
 '''
@filename     : benchmark.py
@time         : 2024/06/20
@author       : zhaodongsheng
@Version      : 1.0
@description  : 批量问答测试
 '''
 # -----------------------------------------------------------------------------------
 import pandas as pd
 import json
 import requests
 import time
 import jwt
 import traceback
 class BatchTest:
    """Small client for the chat query API used by the batch question test.

    On construction a JWT is generated locally for ``userName``; it is then
    sent as a bearer token with every ``parse`` / ``execute`` request.
    """

    # Path of the chat query endpoints, appended to the server URL.
    API_PREFIX = '/api/chat/query/'
    # Per-request timeout in seconds. Without one, a stalled connection would
    # block the whole batch forever. The value is deliberately generous because
    # the service's response time is not known from this file.
    DEFAULT_TIMEOUT = 300

    def __init__(self, url, agentId, chatId, userName, timeout=DEFAULT_TIMEOUT):
        """Store the connection settings and create the bearer token.

        Args:
            url: Base URL of the server, with or without a trailing slash.
            agentId: Agent id sent with every ``parse`` request.
            chatId: Chat id sent with every ``parse`` / ``execute`` request.
            userName: User name embedded in the generated JWT.
            timeout: Seconds passed to ``requests.post`` as ``timeout``.
        """
        self.base_url = self._build_base_url(url)
        self.agentId = agentId
        self.auth_token = self.__get_authorization(userName)
        self.chatId = chatId
        self.timeout = timeout

    @staticmethod
    def _build_base_url(url):
        """Return ``url`` joined with the API prefix without doubling the slash."""
        return url.rstrip('/') + BatchTest.API_PREFIX

    def _post(self, endpoint, payload):
        """POST ``payload`` as JSON to ``base_url + endpoint`` and decode the reply.

        Raises whatever ``requests.post`` or ``response.json()`` raise (for
        example on timeout or on a non-JSON body); callers decide how to react.
        """
        headers = {
            'Authorization': 'Bearer ' + self.auth_token,
            'Content-Type': 'application/json',
        }
        response = requests.post(
            self.base_url + endpoint,
            headers=headers,
            data=json.dumps(payload),
            timeout=self.timeout,
        )
        return response.json()

    def parse(self, query_text):
        """Send ``query_text`` to the ``parse`` endpoint and return the JSON reply."""
        return self._post('parse', {
            'queryText': query_text,
            'agentId': self.agentId,
            'chatId': self.chatId,
        })

    def execute(self, query_text, queryId, parseId=1):
        """Send ``query_text`` to the ``execute`` endpoint and return the JSON reply.

        Args:
            query_text: The question text.
            queryId: Query id to execute; the batch driver takes it from the
                ``parse`` reply.
            parseId: Value sent as ``parseId``. Defaults to 1, the value that
                was previously hard-coded.
        """
        return self._post('execute', {
            'queryText': query_text,
            'parseId': parseId,
            'chatId': self.chatId,
            'queryId': queryId,
        })

    def read_question_from_csv(self, filePath):
        """Load the question file at ``filePath`` into a pandas DataFrame."""
        return pd.read_csv(filePath)

    def __get_authorization(self, userName):
        """Build an HS512-signed JWT carrying ``token_userName`` and ``exp``."""
        # The secret must stay identical to
        # com.tencent.supersonic.auth.api.authentication.config.AuthenticationConfig.tokenAppSecret
        # NOTE(review): the signing secret is hard-coded in source; consider
        # loading it from configuration instead.
        secret = "WIaO9YRRVt+7QtpPvyWsARFngnEcbaKBk783uGFwMrbJBaochsqCH62L4Kijcb0sZCYoSsiKGV/zPml5MnZ3uQ=="
        # Expiry is set 100000000 seconds (roughly three years) in the future.
        exp = time.time() + 100000000
        return jwt.encode({"token_userName": userName, "exp": exp}, secret, algorithm="HS512")
 def benchmark(url:str, agentId:str, chatId:str, filePath:str, userName:str):
    """Ask every question from a CSV file against the chat API, one at a time.

    Each row's ``question`` value is sent to ``parse`` and then to ``execute``
    with the ``queryId`` taken from the parse reply. Failures are printed
    together with their traceback and do not stop the run.
    """
    runner = BatchTest(url, agentId, chatId, userName)
    questions = runner.read_question_from_csv(filePath)
    for _, record in questions.iterrows():
        text = record['question']
        print('start to ask question:', text)
        try:
            parsed = runner.parse(text)
            runner.execute(text, parsed['data']['queryId'])
        except Exception as err:
            # Catch everything so one bad question cannot abort the batch.
            print('error:', err)
            traceback.print_exc()
        else:
            # The one-second pause happens only after a successful round trip;
            # a failed question moves straight on to the next one.
            time.sleep(1)
 if __name__ == '__main__':
    import argparse

    # (short flag, long flag, help text) for each required string option.
    option_specs = (
        ('-u', '--url', 'url:问答系统url,例如：https://chatdata-dev.test.com'),
        ('-a', '--agentId', 'agentId：助手ID'),
        ('-c', '--chatId', 'chatId:会话ID,需要通过浏览器开发者模式获取'),
        ('-f', '--filePath', 'filePath：问题文件路径, csv格式. 请提前上传到benchmark/data目录下'),
        ('-p', '--userName', 'userName：用户名，用户获取登录token'),
    )
    cli = argparse.ArgumentParser()
    for short_flag, long_flag, help_text in option_specs:
        cli.add_argument(short_flag, long_flag, type=str, required=True, help=help_text)
    opts = cli.parse_args()

    # Echo the configuration and ask for an explicit confirmation before
    # sending any request.
    print('批量测试配置信息[url:', opts.url, 'agentId:', opts.agentId, 'chatId:', opts.chatId, 'filePath:', opts.filePath, 'userName:', opts.userName, ']')
    for prompt_line in ('请确认输入的压力测试信息是否正确:', '1. Yes', '2. No'):
        print(prompt_line)
    answer = input()
    if answer not in ('1', 'Yes', 'yes', 'YES'):
        print('请重新输入压力测试配置信息: url, agentId, chatId, filePath, userName')
    else:
        benchmark(opts.url, opts.agentId, opts.chatId, opts.filePath, opts.userName)
--- a/benchmark/data/caiwu.csv
+++ b/benchmark/data/caiwu.csv
@@ -0,0 +1,3 @@
 question
 每个业务组（BG）的员工人数是多少？
 每个业务组的损益情况如何？
--- a/benchmark/data/renli.csv
+++ b/benchmark/data/renli.csv
@@ -0,0 +1,8 @@
 question
 在职人员的男女比例是多少？
 期间入职且离职的人数及其占比如何？
 期间新入职社招人员的平均年龄是多少？
 期间入职且在职的人数有多少？
 期间在职人员的平均年龄是多少？
 当月的净增长人数及其增长率是多少？
 期间新入职社招人员的年龄分布情况如何？
--- a/benchmark/data/shuce.csv
+++ b/benchmark/data/shuce.csv
@@ -0,0 +1,4 @@
 question
 在广东省内，哪一个学校的累计集备数最多，请返回该学校的学校名称
 在广东省内，哪一个学校的累计集体备课数最多，请返回该学校的学校名称
--- a/benchmark/requirements.txt
+++ b/benchmark/requirements.txt
@@ -0,0 +1,5 @@
 pandas==2.0.3
 PyJWT==2.8.0
 requests==2.28.2