mirror of https://github.com/Mai-with-u/MaiBot.git
add;添加表达方式检查脚本
parent
7cbc2f1462
commit
e338edae92
|
|
@ -1,295 +0,0 @@
|
||||||
"""
|
|
||||||
表达方式评估脚本
|
|
||||||
|
|
||||||
功能:
|
|
||||||
1. 随机读取10条表达方式,获取其situation和style
|
|
||||||
2. 使用LLM对表达方式进行评估(每个表达方式单独评估)
|
|
||||||
3. 如果合适,就通过,如果不合适,就丢弃
|
|
||||||
4. 不真正修改数据库,只是做评估
|
|
||||||
"""
|
|
||||||
|
|
||||||
import asyncio
|
|
||||||
import random
|
|
||||||
import json
|
|
||||||
import sys
|
|
||||||
import os
|
|
||||||
|
|
||||||
# 添加项目根目录到路径
|
|
||||||
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
|
|
||||||
sys.path.insert(0, project_root)
|
|
||||||
|
|
||||||
from src.common.database.database_model import Expression
|
|
||||||
from src.common.database.database import db
|
|
||||||
from src.llm_models.utils_model import LLMRequest
|
|
||||||
from src.config.config import model_config
|
|
||||||
from src.common.logger import get_logger
|
|
||||||
|
|
||||||
logger = get_logger("expression_evaluator")
|
|
||||||
|
|
||||||
|
|
||||||
def get_random_expressions(count: int = 10) -> list[Expression]:
    """
    Pick a random sample of expressions from the database.

    Args:
        count: Number of expressions wanted (default 10).

    Returns:
        Up to ``count`` expressions. When the table holds ``count`` rows or
        fewer, every row is returned. An empty list is returned when the
        table is empty or when any error occurs while reading.
    """
    import traceback

    try:
        # The whole table is materialised so the sample can be drawn in memory.
        pool = list(Expression.select())
        available = len(pool)

        if available == 0:
            logger.warning("数据库中没有表达方式记录")
            return []

        if available > count:
            picked = random.sample(pool, count)
            logger.info(f"从 {available} 条表达方式中随机选择了 {len(picked)} 条")
            return picked

        # Not enough rows to sample from: hand back everything.
        logger.info(f"数据库中共有 {available} 条表达方式,全部返回")
        return pool

    except Exception as exc:
        logger.error(f"随机读取表达方式失败: {exc}")
        logger.error(traceback.format_exc())
        return []
|
|
||||||
|
|
||||||
|
|
||||||
def create_evaluation_prompt(situation: str, style: str) -> str:
    """
    Build the prompt asking the LLM to judge one expression.

    Args:
        situation: The situation in which the expression is used.
        style: The expression style being judged.

    Returns:
        The prompt text, which asks for a JSON object with the keys
        ``suitable`` and ``reason``.
    """
    # The prompt is assembled line by line; only two lines are interpolated.
    prompt_lines = [
        "请评估以下表达方式是否合适:",
        "",
        f"情境(situation):{situation}",
        f"风格(style):{style}",
        "",
        "请从以下方面进行评估:",
        "1. 情境描述是否清晰、准确",
        "2. 风格表达是否合理、自然",
        "3. 情境和风格是否匹配",
        "4. 是否存在不当内容或表达",
        "",
        "请以JSON格式输出评估结果:",
        "{",
        '"suitable": true/false,',
        '"reason": "评估理由(如果不合适,请说明原因)"',
        "}",
        "",
        "如果合适,suitable设为true;如果不合适,suitable设为false,并在reason中说明原因。",
        "请严格按照JSON格式输出,不要包含其他内容。",
    ]
    return "\n".join(prompt_lines)
|
|
||||||
|
|
||||||
|
|
||||||
async def evaluate_expression(expression: Expression, llm: LLMRequest) -> dict:
    """
    Evaluate a single expression with the LLM.

    Args:
        expression: The expression record; its ``id``, ``situation`` and
            ``style`` attributes are read.
        llm: The LLM request instance used to generate the evaluation.

    Returns:
        A dict with the keys:
        - expression_id: ID of the expression
        - situation: the situation text
        - style: the style text
        - suitable: always a real ``bool`` after evaluation
        - reason: the LLM's explanation, or an error description
        - error: the error message, or ``None`` when evaluation succeeded

    Any exception is caught; the result is then marked unsuitable and the
    error text is recorded in ``error`` and ``reason``.
    """
    import re
    import traceback

    result = {
        "expression_id": expression.id,
        "situation": expression.situation,
        "style": expression.style,
        "suitable": None,
        "reason": None,
        "error": None,
    }

    try:
        prompt = create_evaluation_prompt(expression.situation, expression.style)

        logger.info(f"正在评估表达方式 ID: {expression.id}, Situation: {expression.situation}, Style: {expression.style}")
        response, (_reasoning, model_name, _) = await llm.generate_response_async(
            prompt=prompt,
            temperature=0.3,
            max_tokens=500,
        )

        logger.debug(f"LLM响应: {response}")
        logger.debug(f"使用模型: {model_name}")

        # Parse the reply: try it as pure JSON first, then fall back to the
        # first brace-delimited object that mentions "suitable".
        try:
            evaluation = json.loads(response)
        except json.JSONDecodeError:
            json_match = re.search(r'\{[^{}]*"suitable"[^{}]*\}', response, re.DOTALL)
            if json_match:
                evaluation = json.loads(json_match.group())
            else:
                raise ValueError("无法从响应中提取JSON格式的评估结果")

        # Valid JSON that is not an object (e.g. a list) has no verdict.
        if not isinstance(evaluation, dict):
            raise ValueError("无法从响应中提取JSON格式的评估结果")

        # Normalise the verdict to a bool. Models sometimes answer with the
        # string "true"/"false"; the string "false" is truthy, and callers
        # filter results with `is True` / `is False`.
        raw_suitable = evaluation.get("suitable", False)
        if isinstance(raw_suitable, str):
            result["suitable"] = raw_suitable.strip().lower() in ("true", "yes", "1")
        else:
            result["suitable"] = bool(raw_suitable)
        result["reason"] = evaluation.get("reason", "未提供理由")

        logger.info(f"表达方式 ID: {expression.id} 评估结果: {'通过' if result['suitable'] else '不通过'}")
        if result["reason"]:
            logger.info(f"评估理由: {result['reason']}")

    except Exception as e:
        logger.error(f"评估表达方式 ID: {expression.id} 时出错: {e}")
        logger.error(traceback.format_exc())
        result["error"] = str(e)
        result["suitable"] = False
        result["reason"] = f"评估过程出错: {str(e)}"

    return result
|
|
||||||
|
|
||||||
|
|
||||||
def _log_result_group(heading: str, rows: list) -> None:
    """Log one group of evaluation results under *heading*, or a placeholder when empty."""
    logger.info(heading)
    if not rows:
        logger.info(" 无")
        return
    for r in rows:
        logger.info(f" ID: {r['expression_id']}")
        logger.info(f" Situation: {r['situation']}")
        logger.info(f" Style: {r['style']}")
        if r['reason']:
            logger.info(f" 理由: {r['reason']}")
        # Only failed evaluations carry an error; passed rows have None here.
        if r['error']:
            logger.info(f" 错误: {r['error']}")


async def main():
    """
    Run the evaluation workflow.

    Connects to the database, samples 10 expressions, evaluates each one
    with the LLM, logs a summary and writes the results to
    ``data/expression_evaluation_results.json`` under the project root.
    The database is only read. Once the connection has been opened it is
    closed on every exit path, including the early returns.
    """
    import traceback

    logger.info("=" * 60)
    logger.info("开始表达方式评估")
    logger.info("=" * 60)

    # Open the database connection.
    try:
        db.connect(reuse_if_open=True)
        logger.info("数据库连接成功")
    except Exception as e:
        logger.error(f"数据库连接失败: {e}")
        return

    try:
        # 1. Sample 10 expressions at random.
        logger.info("\n步骤1: 随机读取10条表达方式")
        expressions = get_random_expressions(10)

        if not expressions:
            logger.error("没有可用的表达方式,退出")
            return

        logger.info(f"成功读取 {len(expressions)} 条表达方式")
        for i, expr in enumerate(expressions, 1):
            logger.info(f" {i}. ID: {expr.id}, Situation: {expr.situation}, Style: {expr.style}")

        # 2. Create the LLM client.
        logger.info("\n步骤2: 创建LLM实例")
        try:
            llm = LLMRequest(
                model_set=model_config.model_task_config.tool_use,
                request_type="expression_evaluator",
            )
            logger.info("LLM实例创建成功")
        except Exception as e:
            logger.error(f"创建LLM实例失败: {e}")
            logger.error(traceback.format_exc())
            return

        # 3. Evaluate each expression in turn.
        logger.info("\n步骤3: 开始评估表达方式")
        results = []

        for i, expression in enumerate(expressions, 1):
            logger.info(f"\n--- 评估进度: {i}/{len(expressions)} ---")
            results.append(await evaluate_expression(expression, llm))

            # Short pause between requests to avoid sending them too fast.
            if i < len(expressions):
                await asyncio.sleep(0.5)

        # 4. Summarise.
        logger.info("\n" + "=" * 60)
        logger.info("评估结果汇总")
        logger.info("=" * 60)

        passed = [r for r in results if r["suitable"] is True]
        failed = [r for r in results if r["suitable"] is False]
        # Errored evaluations are also marked unsuitable, so they overlap `failed`.
        errors = [r for r in results if r["error"] is not None]

        logger.info(f"\n总计: {len(results)} 条")
        logger.info(f"通过: {len(passed)} 条")
        logger.info(f"不通过: {len(failed)} 条")
        if errors:
            logger.info(f"出错: {len(errors)} 条")

        # Per-item details.
        _log_result_group("\n--- 通过的表达方式 ---", passed)
        _log_result_group("\n--- 不通过的表达方式 ---", failed)

        # Save the results to a JSON file (best effort).
        output_file = os.path.join(project_root, "data", "expression_evaluation_results.json")
        try:
            os.makedirs(os.path.dirname(output_file), exist_ok=True)
            with open(output_file, "w", encoding="utf-8") as f:
                json.dump({
                    "total": len(results),
                    "passed": len(passed),
                    "failed": len(failed),
                    "errors": len(errors),
                    "results": results,
                }, f, ensure_ascii=False, indent=2)
            logger.info(f"\n评估结果已保存到: {output_file}")
        except Exception as e:
            logger.warning(f"保存结果到文件失败: {e}")

        logger.info("\n" + "=" * 60)
        logger.info("评估完成")
        logger.info("=" * 60)

    finally:
        # Close the connection on every path out of the workflow.
        try:
            db.close()
            logger.info("数据库连接已关闭")
        except Exception as e:
            logger.warning(f"关闭数据库连接时出错: {e}")
|
|
||||||
|
|
||||||
|
|
||||||
# Script entry point: run the async evaluation workflow to completion.
if __name__ == "__main__":
    asyncio.run(main())
|
|
||||||
|
|
||||||
|
|
@ -0,0 +1,488 @@
|
||||||
|
"""
|
||||||
|
表达方式LLM评估脚本
|
||||||
|
|
||||||
|
功能:
|
||||||
|
1. 读取已保存的人工评估结果(作为效标)
|
||||||
|
2. 使用LLM对相同项目进行评估
|
||||||
|
3. 对比人工评估和LLM评估的结果,输出分析报告
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import random
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
from typing import List, Dict
|
||||||
|
|
||||||
|
# 添加项目根目录到路径
|
||||||
|
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
|
||||||
|
sys.path.insert(0, project_root)
|
||||||
|
|
||||||
|
from src.llm_models.utils_model import LLMRequest
|
||||||
|
from src.config.config import model_config
|
||||||
|
from src.common.logger import get_logger
|
||||||
|
|
||||||
|
logger = get_logger("expression_evaluator_llm")
|
||||||
|
|
||||||
|
# 评估结果文件路径
|
||||||
|
TEMP_DIR = os.path.join(os.path.dirname(__file__), "temp")
|
||||||
|
MANUAL_EVAL_FILE = os.path.join(TEMP_DIR, "manual_evaluation_results.json")
|
||||||
|
|
||||||
|
|
||||||
|
def load_manual_results() -> List[Dict]:
    """
    Load the saved manual evaluation results.

    Reads ``MANUAL_EVAL_FILE`` and returns the list stored under its
    ``manual_results`` key.

    Returns:
        The list of manual results, or an empty list when the file is
        missing or cannot be read or parsed.
    """
    if os.path.exists(MANUAL_EVAL_FILE):
        try:
            with open(MANUAL_EVAL_FILE, "r", encoding="utf-8") as handle:
                payload = json.load(handle)
                loaded = payload.get("manual_results", [])
                logger.info(f"成功加载 {len(loaded)} 条人工评估结果")
        except Exception as exc:
            logger.error(f"加载人工评估结果失败: {exc}")
            print(f"\n✗ 加载人工评估结果失败: {exc}")
            return []
        return loaded

    # No results file yet: tell the user to run the manual script first.
    logger.error(f"未找到人工评估结果文件: {MANUAL_EVAL_FILE}")
    print("\n✗ 错误:未找到人工评估结果文件")
    print(" 请先运行 evaluate_expressions_manual.py 进行人工评估")
    return []
|
||||||
|
|
||||||
|
|
||||||
|
def create_evaluation_prompt(situation: str, style: str) -> str:
    """
    Build the prompt asking the LLM to judge one expression.

    Args:
        situation: The condition or scenario in which the expression is used.
        style: The expression or speaking style being judged.

    Returns:
        The prompt text, which asks for a JSON object with the keys
        ``suitable`` and ``reason``.
    """
    # The prompt is assembled line by line; only two lines are interpolated.
    prompt_lines = [
        "请评估以下表达方式或语言风格以及使用条件或使用情景是否合适:",
        f"使用条件或使用情景:{situation}",
        f"表达方式或言语风格:{style}",
        "",
        "请从以下方面进行评估:",
        "1. 表达方式或言语风格 是否与使用条件或使用情景 匹配",
        "2. 允许部分语法错误或口头化或缺省出现",
        "3. 表达方式不能太过特指,需要具有泛用性",
        "4. 一般不涉及具体的人名或名称",
        "",
        "请以JSON格式输出评估结果:",
        "{",
        '"suitable": true/false,',
        '"reason": "评估理由(如果不合适,请说明原因)"',
        "",
        "}",
        "如果合适,suitable设为true;如果不合适,suitable设为false,并在reason中说明原因。",
        "请严格按照JSON格式输出,不要包含其他内容。",
    ]
    return "\n".join(prompt_lines)
|
||||||
|
|
||||||
|
|
||||||
|
async def _single_llm_evaluation(situation: str, style: str, llm: LLMRequest) -> tuple[bool, str, str | None]:
    """
    Run one LLM evaluation of a (situation, style) pair.

    Args:
        situation: The situation text.
        style: The style text.
        llm: The LLM request instance.

    Returns:
        A ``(suitable, reason, error)`` tuple. ``suitable`` is always a real
        ``bool``. On failure ``suitable`` is ``False``, ``reason`` describes
        the failure and ``error`` holds the error message; on success
        ``error`` is ``None``.
    """
    import re

    try:
        prompt = create_evaluation_prompt(situation, style)
        logger.debug(f"正在评估表达方式: situation={situation}, style={style}")

        response, (_reasoning, _model_name, _) = await llm.generate_response_async(
            prompt=prompt,
            temperature=0.6,
            max_tokens=1024,
        )

        logger.debug(f"LLM响应: {response}")

        # Parse the reply: try it as pure JSON first, then fall back to the
        # first brace-delimited object that mentions "suitable".
        try:
            evaluation = json.loads(response)
        except json.JSONDecodeError as e:
            json_match = re.search(r'\{[^{}]*"suitable"[^{}]*\}', response, re.DOTALL)
            if json_match:
                evaluation = json.loads(json_match.group())
            else:
                raise ValueError("无法从响应中提取JSON格式的评估结果") from e

        # Valid JSON that is not an object (e.g. a list) has no verdict.
        if not isinstance(evaluation, dict):
            raise ValueError("无法从响应中提取JSON格式的评估结果")

        # Normalise the verdict to a bool. Models sometimes answer with the
        # string "true"/"false", and the string "false" is truthy.
        raw_suitable = evaluation.get("suitable", False)
        if isinstance(raw_suitable, str):
            suitable = raw_suitable.strip().lower() in ("true", "yes", "1")
        else:
            suitable = bool(raw_suitable)
        reason = evaluation.get("reason", "未提供理由")

        logger.debug(f"评估结果: {'通过' if suitable else '不通过'}")
        return suitable, reason, None

    except Exception as e:
        logger.error(f"评估表达方式 (situation={situation}, style={style}) 时出错: {e}")
        return False, f"评估过程出错: {str(e)}", str(e)
|
||||||
|
|
||||||
|
|
||||||
|
async def evaluate_expression_llm(situation: str, style: str, llm: LLMRequest) -> Dict:
    """
    Evaluate one expression with the LLM and package the outcome.

    Args:
        situation: The situation text.
        style: The style text.
        llm: The LLM request instance.

    Returns:
        A dict with the keys ``situation``, ``style``, ``suitable``,
        ``reason``, ``error`` and ``evaluator`` (always ``"llm"``).
    """
    logger.info(f"开始评估表达方式: situation={situation}, style={style}")

    verdict, explanation, failure = await _single_llm_evaluation(situation, style, llm)

    # An errored evaluation never counts as a pass.
    final_verdict = False if failure else verdict
    outcome_label = '通过' if final_verdict else '不通过'
    logger.info(f"评估完成: {outcome_label}")

    report = {"situation": situation, "style": style}
    report["suitable"] = final_verdict
    report["reason"] = explanation
    report["error"] = failure
    report["evaluator"] = "llm"
    return report
|
||||||
|
|
||||||
|
|
||||||
|
def compare_evaluations(manual_results: List[Dict], llm_results: List[Dict], method_name: str) -> Dict:
    """
    Compare LLM verdicts against manual verdicts (the gold standard).

    Records are paired on ``(situation, style)``. Manual records without an
    LLM counterpart are skipped in the confusion counts but still count
    towards ``total``. "Positive" means suitable.

    Args:
        manual_results: Manual results; each needs ``situation``, ``style``
            and ``suitable``.
        llm_results: LLM results with the same three keys.
        method_name: Label for the evaluation method, copied to the output.

    Returns:
        A dict with the counts (``total``, ``matched``, ``true_positives``,
        ``true_negatives``, ``false_positives``, ``false_negatives``) and
        the derived metrics. All rates are percentages (0-100), and
        ``f1_score`` is computed from those percentages, so it is on the
        same 0-100 scale. A rate whose denominator is zero is reported as 0.

    Raises:
        KeyError: If a paired record lacks one of the required keys.
    """

    def _percent(numerator, denominator):
        # Percentage that degrades to 0 instead of dividing by zero.
        return (numerator / denominator * 100) if denominator > 0 else 0

    # Index the LLM results by (situation, style).
    llm_dict = {(r["situation"], r["style"]): r for r in llm_results}

    total = len(manual_results)
    true_positives = 0
    true_negatives = 0
    false_positives = 0
    false_negatives = 0

    for manual_result in manual_results:
        pair = (manual_result["situation"], manual_result["style"])
        llm_result = llm_dict.get(pair)
        if llm_result is None:
            continue

        # Normalise both verdicts to bool, so that a non-bool value such as
        # None lands in the same cell it is counted as agreeing in.
        manual_suitable = bool(manual_result["suitable"])
        llm_suitable = bool(llm_result["suitable"])

        if manual_suitable and llm_suitable:
            true_positives += 1
        elif manual_suitable:
            false_negatives += 1
        elif llm_suitable:
            false_positives += 1
        else:
            true_negatives += 1

    # Agreement is exactly the diagonal of the confusion matrix.
    matched = true_positives + true_negatives

    accuracy = _percent(matched, total)
    precision = _percent(true_positives, true_positives + false_positives)
    recall = _percent(true_positives, true_positives + false_negatives)
    f1_score = (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0
    specificity = _percent(true_negatives, true_negatives + false_positives)

    # Share of items the human judged unsuitable.
    manual_unsuitable_count = true_negatives + false_positives
    manual_unsuitable_rate = _percent(manual_unsuitable_count, total)

    # After removing everything the LLM rejected, TP + FP items remain.
    # Of those, the FP items are the ones the human judged unsuitable.
    llm_kept_count = true_positives + false_positives
    llm_kept_unsuitable_rate = _percent(false_positives, llm_kept_count)

    # Positive when LLM filtering lowers the unsuitable rate.
    rate_difference = manual_unsuitable_rate - llm_kept_unsuitable_rate

    # Accuracy is compared against a fixed 50% baseline.
    random_baseline = 50.0
    accuracy_above_random = accuracy - random_baseline
    accuracy_improvement_ratio = (accuracy / random_baseline) if random_baseline > 0 else 0

    return {
        "method": method_name,
        "total": total,
        "matched": matched,
        "accuracy": accuracy,
        "accuracy_above_random": accuracy_above_random,
        "accuracy_improvement_ratio": accuracy_improvement_ratio,
        "true_positives": true_positives,
        "true_negatives": true_negatives,
        "false_positives": false_positives,
        "false_negatives": false_negatives,
        "precision": precision,
        "recall": recall,
        "f1_score": f1_score,
        "specificity": specificity,
        "manual_unsuitable_rate": manual_unsuitable_rate,
        "llm_kept_unsuitable_rate": llm_kept_unsuitable_rate,
        "rate_difference": rate_difference,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def _collect_disagreements(manual_results, llm_dict, manual_suitable):
    """Collect report rows where the human verdict is *manual_suitable* and the LLM verdict is the opposite."""
    rows = []
    for manual_result in manual_results:
        llm_result = llm_dict.get((manual_result["situation"], manual_result["style"]))
        if llm_result is None:
            continue
        if bool(manual_result["suitable"]) != manual_suitable:
            continue
        if bool(llm_result["suitable"]) == manual_suitable:
            continue
        rows.append({
            "situation": manual_result["situation"],
            "style": manual_result["style"],
            "manual_suitable": manual_result["suitable"],
            "llm_suitable": llm_result["suitable"],
            "llm_reason": llm_result.get("reason", "未提供理由"),
            "llm_error": llm_result.get("error"),
        })
    return rows


def _print_disagreements(title, items, kind_label, manual_line, llm_line, none_message):
    """Print one section (FP or FN) listing the items on which the LLM disagreed with the human."""
    print("\n" + "=" * 60)
    print(title)
    print("=" * 60)

    if not items:
        print(none_message)
        return

    print(f"\n共找到 {len(items)} 条{kind_label}项目:\n")
    for idx, item in enumerate(items, 1):
        print(f"--- [{idx}] ---")
        print(f"Situation: {item['situation']}")
        print(f"Style: {item['style']}")
        print(manual_line)
        print(llm_line)
        if item.get('llm_error'):
            print(f"LLM错误: {item['llm_error']}")
        print(f"LLM理由: {item['llm_reason']}")
        print()


def _print_comparison_report(comparison):
    """Print the metric report for a dict produced by ``compare_evaluations``."""
    tp = comparison['true_positives']
    tn = comparison['true_negatives']
    fp = comparison['false_positives']
    fn = comparison['false_negatives']

    print("\n" + "=" * 60)
    print("评估结果(以人工评估为标准)")
    print("=" * 60)

    # Core capability metrics first.
    print(f"\n--- {comparison['method']} ---")
    print(f" 总数: {comparison['total']} 条")
    print()
    print(f" 特定负类召回率: {comparison['specificity']:.2f}% (将不合适项目正确提取出来的能力)")
    print(f" - 计算: TN / (TN + FP) = {tn} / ({tn} + {fp})")
    print(f" - 含义: 在 {tn + fp} 个实际不合适的项目中,正确识别出 {tn} 个")
    print()
    print(f" 召回率: {comparison['recall']:.2f}% (尽可能少的误删合适项目的能力)")
    print(f" - 计算: TP / (TP + FN) = {tp} / ({tp} + {fn})")
    print(f" - 含义: 在 {tp + fn} 个实际合适的项目中,正确识别出 {tp} 个")
    print()
    print(" 【其他指标】")
    print(f" 准确率: {comparison['accuracy']:.2f}% (整体判断正确率)")
    print(f" 精确率: {comparison['precision']:.2f}% (判断为合适的项目中,实际合适的比例)")
    print(f" F1分数: {comparison['f1_score']:.2f} (精确率和召回率的调和平均)")
    print(f" 匹配数: {comparison['matched']}/{comparison['total']}")
    print()
    print(" 【不合适率分析】")
    print(f" 人工效标的不合适率: {comparison['manual_unsuitable_rate']:.2f}%")
    print(f" - 计算: (TN + FP) / 总数 = ({tn} + {fp}) / {comparison['total']}")
    print(f" - 含义: 在人工评估中,有 {comparison['manual_unsuitable_rate']:.2f}% 的项目被判定为不合适")
    print()
    print(f" 经过LLM删除后剩余项目中的不合适率: {comparison['llm_kept_unsuitable_rate']:.2f}%")
    print(f" - 计算: FP / (TP + FP) = {fp} / ({tp} + {fp})")
    print(f" - 含义: 在所有项目中,移除LLM判定为不合适的项目后,在剩下的 {tp + fp} 个项目中,人工认为不合适的项目占 {comparison['llm_kept_unsuitable_rate']:.2f}%")
    print()
    print(" 【分类统计】")
    print(f" TP (正确识别为合适): {tp}")
    print(f" TN (正确识别为不合适): {tn} ⭐")
    print(f" FP (误判为合适): {fp} ⚠️")
    print(f" FN (误删合适项目): {fn} ⚠️")


async def main(count: int | None = None):
    """
    Run the LLM evaluation and compare it with the manual evaluation.

    Loads the manual results, optionally samples ``count`` of them,
    evaluates each with the LLM, prints the false-positive and
    false-negative items followed by the metric report, and writes
    everything to ``data/expression_evaluation_llm.json`` under the
    project root.

    Args:
        count: Number of records to sample at random; ``None`` uses all of
            them. Values <= 0 abort with a message; values larger than the
            available data fall back to using everything.
    """
    import traceback

    logger.info("=" * 60)
    logger.info("开始表达方式LLM评估")
    logger.info("=" * 60)

    # 1. Load the manual evaluation results.
    print("\n步骤1: 加载人工评估结果")
    manual_results = load_manual_results()
    if not manual_results:
        return

    print(f"成功加载 {len(manual_results)} 条人工评估结果")

    # Optionally sample a subset.
    if count is not None:
        if count <= 0:
            print(f"\n✗ 错误:指定的数量必须大于0,当前值: {count}")
            return
        if count > len(manual_results):
            print(f"\n⚠ 警告:指定的数量 ({count}) 大于可用数据量 ({len(manual_results)}),将使用全部数据")
        else:
            random.seed()  # reseed the generator before sampling
            manual_results = random.sample(manual_results, count)
            print(f"随机选取 {len(manual_results)} 条数据进行评估")

    # Validate the records. "suitable" is required as well: it is read after
    # the LLM calls, so a record without it would crash the run only once
    # all of the LLM requests had already been made.
    required_keys = ("situation", "style", "suitable")
    valid_manual_results = []
    for r in manual_results:
        if all(key in r for key in required_keys):
            valid_manual_results.append(r)
        else:
            logger.warning(f"跳过无效数据: {r}")

    if len(valid_manual_results) != len(manual_results):
        print(f"警告:{len(manual_results) - len(valid_manual_results)} 条数据缺少必要字段,已跳过")

    print(f"有效数据: {len(valid_manual_results)} 条")

    # 2. Create the LLM client.
    print("\n步骤2: 创建LLM实例")
    try:
        llm = LLMRequest(
            model_set=model_config.model_task_config.tool_use,
            request_type="expression_evaluator_llm",
        )
    except Exception as e:
        logger.error(f"创建LLM实例失败: {e}")
        logger.error(traceback.format_exc())
        return

    # 3. Evaluate every valid record with the LLM.
    print("\n步骤3: 开始LLM评估")
    llm_results = []
    for i, manual_result in enumerate(valid_manual_results, 1):
        print(f"LLM评估进度: {i}/{len(valid_manual_results)}")
        llm_results.append(await evaluate_expression_llm(
            manual_result["situation"],
            manual_result["style"],
            llm,
        ))
        # Pause between requests; nothing to wait for after the last one.
        if i < len(valid_manual_results):
            await asyncio.sleep(0.3)

    # 4. List the disagreements before the metric report.
    llm_dict = {(r["situation"], r["style"]): r for r in llm_results}

    # 4.1 FP: rejected by the human but accepted by the LLM.
    _print_disagreements(
        "人工评估不通过但LLM误判为通过的项目(FP - False Positive)",
        _collect_disagreements(valid_manual_results, llm_dict, False),
        "误判",
        "人工评估: 不通过 ❌",
        "LLM评估: 通过 ✅ (误判)",
        "\n✓ 没有误判项目(所有人工评估不通过的项目都被LLM正确识别为不通过)",
    )

    # 4.2 FN: accepted by the human but rejected by the LLM.
    _print_disagreements(
        "人工评估通过但LLM误判为不通过的项目(FN - False Negative)",
        _collect_disagreements(valid_manual_results, llm_dict, True),
        "误删",
        "人工评估: 通过 ✅",
        "LLM评估: 不通过 ❌ (误删)",
        "\n✓ 没有误删项目(所有人工评估通过的项目都被LLM正确识别为通过)",
    )

    # 5. Compare and print the metric report.
    comparison = compare_evaluations(valid_manual_results, llm_results, "LLM评估")
    _print_comparison_report(comparison)

    # 6. Save everything to a JSON file (best effort).
    output_file = os.path.join(project_root, "data", "expression_evaluation_llm.json")
    try:
        os.makedirs(os.path.dirname(output_file), exist_ok=True)
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump({
                "manual_results": valid_manual_results,
                "llm_results": llm_results,
                "comparison": comparison,
            }, f, ensure_ascii=False, indent=2)
        logger.info(f"\n评估结果已保存到: {output_file}")
    except Exception as e:
        logger.warning(f"保存结果到文件失败: {e}")

    print("\n" + "=" * 60)
    print("评估完成")
    print("=" * 60)
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: parse the command line and run the async workflow.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="表达方式LLM评估脚本",
        # Raw formatter so the epilog is printed with its line breaks intact.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
示例:
 python evaluate_expressions_llm_v6.py # 使用全部数据
 python evaluate_expressions_llm_v6.py -n 50 # 随机选取50条数据
 python evaluate_expressions_llm_v6.py --count 100 # 随机选取100条数据
 """
    )
    # Optional sample size; when omitted, main() receives None and uses all data.
    parser.add_argument(
        "-n", "--count",
        type=int,
        default=None,
        help="随机选取的数据条数(默认:使用全部数据)"
    )

    args = parser.parse_args()
    asyncio.run(main(count=args.count))
|
||||||
|
|
||||||
|
|
@ -0,0 +1,278 @@
|
||||||
|
"""
|
||||||
|
表达方式人工评估脚本
|
||||||
|
|
||||||
|
功能:
|
||||||
|
1. 不停随机抽取项目(不重复)进行人工评估
|
||||||
|
2. 将结果保存到 temp 文件夹下的 JSON 文件,作为效标(标准答案)
|
||||||
|
3. 支持继续评估(从已有文件中读取已评估的项目,避免重复)
|
||||||
|
"""
|
||||||
|
|
||||||
|
import random
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
from typing import List, Dict, Set, Tuple
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
# 添加项目根目录到路径
|
||||||
|
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
|
||||||
|
sys.path.insert(0, project_root)
|
||||||
|
|
||||||
|
from src.common.database.database_model import Expression
|
||||||
|
from src.common.database.database import db
|
||||||
|
from src.common.logger import get_logger
|
||||||
|
|
||||||
|
logger = get_logger("expression_evaluator_manual")
|
||||||
|
|
||||||
|
# 评估结果文件路径
|
||||||
|
TEMP_DIR = os.path.join(os.path.dirname(__file__), "temp")
|
||||||
|
MANUAL_EVAL_FILE = os.path.join(TEMP_DIR, "manual_evaluation_results.json")
|
||||||
|
|
||||||
|
|
||||||
|
def load_existing_results() -> tuple[List[Dict], Set[Tuple[str, str]]]:
    """
    Load previously saved manual evaluation results.

    Returns:
        A pair ``(results, evaluated_pairs)``: the saved result list and the
        set of ``(situation, style)`` tuples already evaluated. Both are
        empty when the file is missing or cannot be read or parsed.
    """
    if os.path.exists(MANUAL_EVAL_FILE):
        try:
            with open(MANUAL_EVAL_FILE, "r", encoding="utf-8") as handle:
                payload = json.load(handle)
                saved = payload.get("manual_results", [])

                # (situation, style) is the identity of an evaluated item;
                # records missing either key are left out of the set.
                seen_pairs = set()
                for record in saved:
                    if "situation" in record and "style" in record:
                        seen_pairs.add((record["situation"], record["style"]))

                logger.info(f"已加载 {len(saved)} 条已有评估结果")
                return saved, seen_pairs
        except Exception as exc:
            logger.error(f"加载已有评估结果失败: {exc}")

    return [], set()
|
||||||
|
|
||||||
|
|
||||||
|
def save_results(manual_results: List[Dict]):
    """
    Save the manual evaluation results to ``MANUAL_EVAL_FILE``.

    The data is first written to a temporary file next to the target and
    then moved into place with ``os.replace``, so an interrupted write
    cannot truncate the results saved earlier. Failures are logged and
    printed, not raised.

    Args:
        manual_results: The full list of evaluation results to persist.
    """
    tmp_file = MANUAL_EVAL_FILE + ".tmp"
    try:
        os.makedirs(TEMP_DIR, exist_ok=True)

        data = {
            "last_updated": datetime.now().isoformat(),
            "total_count": len(manual_results),
            "manual_results": manual_results,
        }

        with open(tmp_file, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        # Swap the finished file in; the old file stays intact until here.
        os.replace(tmp_file, MANUAL_EVAL_FILE)

        logger.info(f"评估结果已保存到: {MANUAL_EVAL_FILE}")
        print(f"\n✓ 评估结果已保存(共 {len(manual_results)} 条)")
    except Exception as e:
        logger.error(f"保存评估结果失败: {e}")
        print(f"\n✗ 保存评估结果失败: {e}")
        # Best-effort cleanup of a partially written temporary file.
        try:
            if os.path.exists(tmp_file):
                os.remove(tmp_file)
        except OSError:
            pass
|
||||||
|
|
||||||
|
|
||||||
|
def get_unevaluated_expressions(evaluated_pairs: Set[Tuple[str, str]], batch_size: int = 10) -> List[Expression]:
    """
    Fetch a random batch of expressions that have not been evaluated yet.

    Args:
        evaluated_pairs: Set of ``(situation, style)`` tuples already evaluated.
        batch_size: Maximum number of expressions to return.

    Returns:
        Up to ``batch_size`` unevaluated expressions. When ``batch_size`` or
        fewer remain, all of them are returned. An empty list is returned
        when the table is empty, nothing is left, or an error occurs.
    """
    import traceback

    try:
        # The whole table is loaded and filtered in memory.
        every_expression = list(Expression.select())

        if len(every_expression) == 0:
            logger.warning("数据库中没有表达方式记录")
            return []

        # An item counts as evaluated only when both situation and style match.
        pending = []
        for candidate in every_expression:
            if (candidate.situation, candidate.style) in evaluated_pairs:
                continue
            pending.append(candidate)

        remaining = len(pending)
        if remaining == 0:
            logger.info("所有项目都已评估完成")
            return []

        if remaining > batch_size:
            chosen = random.sample(pending, batch_size)
            logger.info(f"从 {remaining} 条未评估项目中随机选择了 {len(chosen)} 条")
            return chosen

        # Fewer left than requested: return them all.
        logger.info(f"剩余 {remaining} 条未评估项目,全部返回")
        return pending

    except Exception as exc:
        logger.error(f"获取未评估表达方式失败: {exc}")
        logger.error(traceback.format_exc())
        return []
|
||||||
|
|
||||||
|
|
||||||
|
def manual_evaluate_expression(expression: Expression, index: int, total: int) -> Dict:
    """Ask the operator on the console whether one expression is suitable.

    Args:
        expression: Expression record; its ``situation`` and ``style`` are
            displayed and copied into the result.
        index: 1-based position inside the current batch (display only).
        total: Size of the current batch (display only).

    Returns:
        * a dict with the keys ``situation``, ``style``, ``suitable``,
          ``reason`` (always ``None``), ``evaluator`` (``"manual"``) and
          ``evaluated_at`` (ISO timestamp) when the operator answers yes/no;
        * ``None`` when the operator quits or standard input is closed;
        * the string ``"skip"`` when the operator skips the item.
    """
    print("\n" + "=" * 60)
    print(f"人工评估 [{index}/{total}]")
    print("=" * 60)
    print(f"Situation: {expression.situation}")
    print(f"Style: {expression.style}")
    print("\n请评估该表达方式是否合适:")
    print(" 输入 'y' 或 'yes' 或 '1' 表示合适(通过)")
    print(" 输入 'n' 或 'no' 或 '0' 表示不合适(不通过)")
    print(" 输入 'q' 或 'quit' 退出评估")
    print(" 输入 's' 或 'skip' 跳过当前项目")

    while True:
        try:
            user_input = input("\n您的评估 (y/n/q/s): ").strip().lower()
        except EOFError:
            # stdin was closed (Ctrl-D or an exhausted pipe). Treat it like an
            # explicit quit instead of letting the exception abort the session.
            print("退出评估")
            return None

        if user_input in ('q', 'quit'):
            print("退出评估")
            return None

        if user_input in ('s', 'skip'):
            print("跳过当前项目")
            return "skip"

        if user_input in ('y', 'yes', '1', '是', '通过'):
            suitable = True
            break
        if user_input in ('n', 'no', '0', '否', '不通过'):
            suitable = False
            break

        # Anything else: ask again.
        print("输入无效,请重新输入 (y/n/q/s)")

    result = {
        "situation": expression.situation,
        "style": expression.style,
        "suitable": suitable,
        "reason": None,
        "evaluator": "manual",
        "evaluated_at": datetime.now().isoformat(),
    }

    print(f"\n✓ 已记录:{'通过' if suitable else '不通过'}")

    return result
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Run the interactive manual-evaluation session.

    Connects to the database, loads previously saved verdicts, then repeatedly
    fetches batches of 10 unevaluated expressions, asks the operator about each
    one, and saves the accumulated results after every batch. The session ends
    when nothing is left, when the operator quits, or when the operator
    declines to continue. The database connection is closed on every exit path.
    """
    logger.info("=" * 60)
    logger.info("开始表达方式人工评估")
    logger.info("=" * 60)

    # Open the database connection; without it nothing else can run.
    try:
        db.connect(reuse_if_open=True)
        logger.info("数据库连接成功")
    except Exception as e:
        logger.error(f"数据库连接失败: {e}")
        return

    # try/finally: the loop below has several early ``return`` statements and
    # each of them must still release the connection.
    try:
        existing_results, evaluated_pairs = load_existing_results()
        manual_results = existing_results.copy()

        if evaluated_pairs:
            print(f"\n已加载 {len(existing_results)} 条已有评估结果")
            print(f"已评估项目数: {len(evaluated_pairs)}")

        print("\n" + "=" * 60)
        print("开始人工评估")
        print("=" * 60)
        print("提示:可以随时输入 'q' 退出,输入 's' 跳过当前项目")
        print("评估结果会自动保存到文件\n")

        batch_size = 10
        batch_count = 0

        while True:
            expressions = get_unevaluated_expressions(evaluated_pairs, batch_size)

            if not expressions:
                print("\n" + "=" * 60)
                print("所有项目都已评估完成!")
                print("=" * 60)
                break

            batch_count += 1
            print(f"\n--- 批次 {batch_count}:评估 {len(expressions)} 条项目 ---")

            batch_results = []
            for i, expression in enumerate(expressions, 1):
                manual_result = manual_evaluate_expression(expression, i, len(expressions))

                if manual_result is None:
                    # Operator quit: persist what this batch collected so far.
                    print("\n评估已中断")
                    if batch_results:
                        manual_results.extend(batch_results)
                        save_results(manual_results)
                    return

                if manual_result == "skip":
                    # Skipped items are not recorded as evaluated.
                    continue

                batch_results.append(manual_result)
                # (situation, style) is the identity used to detect evaluated items.
                evaluated_pairs.add((manual_result["situation"], manual_result["style"]))

            manual_results.extend(batch_results)
            save_results(manual_results)

            print(f"\n当前批次完成,已评估总数: {len(manual_results)} 条")

            # Ask whether to go on with another batch.
            while True:
                try:
                    continue_input = input("\n是否继续评估下一批?(y/n): ").strip().lower()
                except EOFError:
                    # stdin closed: nothing more can be answered, so stop here.
                    continue_input = 'n'

                if continue_input in ('y', 'yes', '1', '是', '继续'):
                    break
                if continue_input in ('n', 'no', '0', '否', '退出'):
                    print("\n评估结束")
                    return
                print("输入无效,请重新输入 (y/n)")
    finally:
        # Close the database connection regardless of how the session ended.
        try:
            db.close()
            logger.info("数据库连接已关闭")
        except Exception as e:
            logger.warning(f"关闭数据库连接时出错: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
|
|
||||||
|
|
@ -0,0 +1,476 @@
|
||||||
|
"""
|
||||||
|
表达方式评估脚本
|
||||||
|
|
||||||
|
功能:
|
||||||
|
1. 随机读取指定数量的表达方式,获取其situation和style
|
||||||
|
2. 先进行人工评估(逐条手动评估)
|
||||||
|
3. 然后使用LLM进行评估
|
||||||
|
4. 对比人工评估和LLM评估的正确率、精确率、召回率、F1分数等指标(以人工评估为标准)
|
||||||
|
5. 不真正修改数据库,只是做评估
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import random
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
from typing import List, Dict
|
||||||
|
|
||||||
|
# 添加项目根目录到路径
|
||||||
|
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
|
||||||
|
sys.path.insert(0, project_root)
|
||||||
|
|
||||||
|
from src.common.database.database_model import Expression
|
||||||
|
from src.common.database.database import db
|
||||||
|
from src.llm_models.utils_model import LLMRequest
|
||||||
|
from src.config.config import model_config
|
||||||
|
from src.common.logger import get_logger
|
||||||
|
|
||||||
|
logger = get_logger("expression_evaluator_comparison")
|
||||||
|
|
||||||
|
|
||||||
|
def get_random_expressions(count: int = 10) -> List[Expression]:
    """Return a random selection of expression records.

    Args:
        count: How many expressions to draw (default 10).

    Returns:
        ``count`` randomly chosen expressions, or every expression when the
        table holds ``count`` rows or fewer. An empty list is returned when
        the table is empty or the query raised.
    """
    try:
        pool = list(Expression.select())
        pool_size = len(pool)

        if pool_size == 0:
            logger.warning("数据库中没有表达方式记录")
            return []

        if pool_size > count:
            # Enough rows to sample from without replacement.
            chosen = random.sample(pool, count)
            logger.info(f"从 {pool_size} 条表达方式中随机选择了 {len(chosen)} 条")
            return chosen

        # Not more rows than requested: hand back everything.
        logger.info(f"数据库中共有 {pool_size} 条表达方式,全部返回")
        return pool

    except Exception as exc:
        logger.error(f"随机读取表达方式失败: {exc}")
        import traceback

        logger.error(traceback.format_exc())
        return []
|
||||||
|
|
||||||
|
|
||||||
|
def manual_evaluate_expression(expression: Expression, index: int, total: int) -> Dict:
    """Ask the operator on the console whether one expression is suitable.

    Args:
        expression: Expression record; its ``id``, ``situation`` and ``style``
            are copied into the result.
        index: 1-based position of this item (display only).
        total: Number of items in this run (display only).

    Returns:
        A dict with the keys ``expression_id``, ``situation``, ``style``,
        ``suitable`` (the operator's verdict), ``reason`` (always ``None``)
        and ``evaluator`` (``"manual"``). ``None`` is returned instead when
        the operator quits or standard input is closed.
    """
    print("\n" + "=" * 60)
    print(f"人工评估 [{index}/{total}]")
    print("=" * 60)
    print(f"Situation: {expression.situation}")
    print(f"Style: {expression.style}")
    print("\n请评估该表达方式是否合适:")
    print(" 输入 'y' 或 'yes' 或 '1' 表示合适(通过)")
    print(" 输入 'n' 或 'no' 或 '0' 表示不合适(不通过)")
    print(" 输入 'q' 或 'quit' 退出评估")

    while True:
        try:
            user_input = input("\n您的评估 (y/n/q): ").strip().lower()
        except EOFError:
            # stdin was closed (Ctrl-D or an exhausted pipe). Treat it like an
            # explicit quit instead of crashing with a traceback.
            print("退出评估")
            return None

        if user_input in ('q', 'quit'):
            print("退出评估")
            return None

        if user_input in ('y', 'yes', '1', '是', '通过'):
            suitable = True
            break
        if user_input in ('n', 'no', '0', '否', '不通过'):
            suitable = False
            break

        # Anything else: ask again.
        print("输入无效,请重新输入 (y/n/q)")

    result = {
        "expression_id": expression.id,
        "situation": expression.situation,
        "style": expression.style,
        "suitable": suitable,
        "reason": None,
        "evaluator": "manual",
    }

    print(f"\n✓ 已记录:{'通过' if suitable else '不通过'}")

    return result
|
||||||
|
|
||||||
|
|
||||||
|
def create_evaluation_prompt(situation: str, style: str) -> str:
    """Build the LLM prompt that asks for a suitability verdict.

    Args:
        situation: The situation text of the expression.
        style: The style text of the expression.

    Returns:
        The prompt text. It embeds both inputs, lists the evaluation criteria
        and asks for a JSON object with ``suitable`` and ``reason``.
    """
    # The prompt is assembled line by line; empty entries are blank lines.
    prompt_lines = [
        "请评估以下表达方式是否合适:",
        "",
        f"情境(situation):{situation}",
        f"风格(style):{style}",
        "",
        "请从以下方面进行评估:",
        "1. 情境描述是否清晰、准确",
        "2. 风格表达是否合理、自然",
        "3. 情境和风格是否匹配",
        "4. 允许部分语法错误出现",
        "5. 允许口头化或缺省表达",
        "6. 允许部分上下文缺失",
        "",
        "请以JSON格式输出评估结果:",
        "{",
        '    "suitable": true/false,',
        '    "reason": "评估理由(如果不合适,请说明原因)"',
        "}",
        "",
        "如果合适,suitable设为true;如果不合适,suitable设为false,并在reason中说明原因。",
        "请严格按照JSON格式输出,不要包含其他内容。",
    ]
    return "\n".join(prompt_lines)
|
||||||
|
|
||||||
|
|
||||||
|
def _coerce_suitable_flag(value) -> bool:
    """Turn the ``suitable`` field of an LLM reply into a real ``bool``.

    Models sometimes emit the flag as a string; a plain truthiness test would
    then read ``"false"`` as approval.
    """
    if isinstance(value, str):
        return value.strip().lower() in {"true", "yes", "1"}
    return bool(value)


async def _single_llm_evaluation(expression: Expression, llm: LLMRequest) -> tuple[bool, str, str | None]:
    """Run one LLM evaluation for a single expression.

    Args:
        expression: Expression record; ``situation`` and ``style`` feed the
            prompt and ``id`` is used in log messages.
        llm: Request object whose ``generate_response_async`` is awaited.

    Returns:
        ``(suitable, reason, error)``. On success ``error`` is ``None``. On any
        failure ``suitable`` is ``False``, ``reason`` describes the failure and
        ``error`` holds the exception text.
    """
    try:
        prompt = create_evaluation_prompt(expression.situation, expression.style)
        logger.debug(f"正在评估表达方式 ID: {expression.id}")

        # Only the response text is used; the metadata tuple is unpacked to
        # keep the same shape check as before.
        response, (_reasoning, _model_name, _) = await llm.generate_response_async(
            prompt=prompt,
            temperature=0.6,
            max_tokens=1024
        )

        logger.debug(f"LLM响应: {response}")

        # Parse the reply as JSON; fall back to extracting the first flat
        # object that mentions "suitable" when extra text surrounds it.
        try:
            evaluation = json.loads(response)
        except json.JSONDecodeError:
            import re

            json_match = re.search(r'\{[^{}]*"suitable"[^{}]*\}', response, re.DOTALL)
            if json_match is None:
                raise ValueError("无法从响应中提取JSON格式的评估结果")
            evaluation = json.loads(json_match.group())

        if not isinstance(evaluation, dict):
            # Valid JSON but not an object (e.g. a list): nothing to read from.
            raise ValueError("无法从响应中提取JSON格式的评估结果")

        suitable = _coerce_suitable_flag(evaluation.get("suitable", False))
        reason = evaluation.get("reason", "未提供理由")

        logger.debug(f"评估结果: {'通过' if suitable else '不通过'}")
        return suitable, reason, None

    except Exception as e:
        logger.error(f"评估表达方式 ID: {expression.id} 时出错: {e}")
        return False, f"评估过程出错: {str(e)}", str(e)
|
||||||
|
|
||||||
|
|
||||||
|
async def evaluate_expression_llm(expression: Expression, llm: LLMRequest) -> Dict:
    """Evaluate one expression with the LLM and package the outcome.

    Args:
        expression: Expression record to evaluate.
        llm: Request object forwarded to ``_single_llm_evaluation``.

    Returns:
        A dict with the keys ``expression_id``, ``situation``, ``style``,
        ``suitable``, ``reason``, ``error`` and ``evaluator`` (``"llm"``).
    """
    logger.info(f"开始评估表达方式 ID: {expression.id}")

    verdict, explanation, failure = await _single_llm_evaluation(expression, llm)

    # A failed call never counts as approval.
    verdict = False if failure else verdict

    logger.info(f"评估完成: {'通过' if verdict else '不通过'}")

    record = {
        "expression_id": expression.id,
        "situation": expression.situation,
        "style": expression.style,
        "suitable": verdict,
        "reason": explanation,
        "error": failure,
        "evaluator": "llm",
    }
    return record
|
||||||
|
|
||||||
|
|
||||||
|
def compare_evaluations(manual_results: List[Dict], llm_results: List[Dict], method_name: str) -> Dict:
    """Compare LLM verdicts against manual verdicts (manual is ground truth).

    Results are paired by ``expression_id``. Manual entries without an LLM
    counterpart are left out of every metric. Verdicts are normalised to
    ``bool`` first, so ``matched`` always equals ``TP + TN`` even when a
    verdict arrives as a non-bool value.

    Args:
        manual_results: Manual results; each needs ``expression_id`` and ``suitable``.
        llm_results: LLM results; each needs ``expression_id`` and ``suitable``.
        method_name: Label stored under ``method`` in the returned dict.

    Returns:
        A dict with ``method``, ``total`` (number of manual results),
        ``compared`` (number of pairs actually compared), ``matched``, the
        confusion-matrix counts, and the percentage metrics ``accuracy``,
        ``precision``, ``recall``, ``f1_score`` and ``specificity`` (all on a
        0-100 scale, ``0`` when the denominator is empty), plus
        ``accuracy_above_random`` and ``accuracy_improvement_ratio`` relative
        to a 50% baseline.
    """

    def _percent(numerator, denominator):
        # Percentage with the original "0 when undefined" convention.
        return (numerator / denominator * 100) if denominator > 0 else 0

    # Index LLM results by expression id for O(1) pairing.
    llm_by_id = {r["expression_id"]: r for r in llm_results}

    total = len(manual_results)
    true_positives = 0
    true_negatives = 0
    false_positives = 0
    false_negatives = 0

    for manual_result in manual_results:
        llm_result = llm_by_id.get(manual_result["expression_id"])
        if llm_result is None:
            continue

        manual_suitable = bool(manual_result["suitable"])
        llm_suitable = bool(llm_result["suitable"])

        if manual_suitable and llm_suitable:
            true_positives += 1
        elif manual_suitable:
            false_negatives += 1
        elif llm_suitable:
            false_positives += 1
        else:
            true_negatives += 1

    matched = true_positives + true_negatives
    compared = matched + false_positives + false_negatives

    # Accuracy is taken over the compared pairs so that it agrees with the
    # confusion matrix the other metrics are built from.
    accuracy = _percent(matched, compared)
    precision = _percent(true_positives, true_positives + false_positives)
    recall = _percent(true_positives, true_positives + false_negatives)
    f1_score = (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0
    specificity = _percent(true_negatives, true_negatives + false_positives)

    # Reference point: a coin flip on a binary decision.
    random_baseline = 50.0
    accuracy_above_random = accuracy - random_baseline
    accuracy_improvement_ratio = (accuracy / random_baseline) if random_baseline > 0 else 0

    return {
        "method": method_name,
        "total": total,
        "compared": compared,
        "matched": matched,
        "accuracy": accuracy,
        "accuracy_above_random": accuracy_above_random,
        "accuracy_improvement_ratio": accuracy_improvement_ratio,
        "true_positives": true_positives,
        "true_negatives": true_negatives,
        "false_positives": false_positives,
        "false_negatives": false_negatives,
        "precision": precision,
        "recall": recall,
        "f1_score": f1_score,
        "specificity": specificity,
    }
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def _print_comparison_report(comparison: Dict) -> None:
    """Print the metric summary of one manual-vs-LLM comparison."""
    print("\n" + "=" * 60)
    print("评估结果(以人工评估为标准)")
    print("=" * 60)
    print("\n评估目标:")
    print(" 1. 核心能力:将不合适的项目正确提取出来(特定负类召回率)")
    print(" 2. 次要能力:尽可能少的误删合适的项目(召回率)")

    # Core metrics first, then the remaining ones.
    print("\n【详细对比】")
    print(f"\n--- {comparison['method']} ---")
    print(f" 总数: {comparison['total']} 条")
    print()
    print(" 【核心能力指标】")
    print(f" ⭐ 特定负类召回率: {comparison['specificity']:.2f}% (将不合适项目正确提取出来的能力)")
    print(f" - 计算: TN / (TN + FP) = {comparison['true_negatives']} / ({comparison['true_negatives']} + {comparison['false_positives']})")
    print(f" - 含义: 在 {comparison['true_negatives'] + comparison['false_positives']} 个实际不合适的项目中,正确识别出 {comparison['true_negatives']} 个")
    print(f" - 随机水平: 50.00% (当前高于随机: {comparison['specificity'] - 50.0:+.2f}%)")
    print()
    print(f" ⭐ 召回率: {comparison['recall']:.2f}% (尽可能少的误删合适项目的能力)")
    print(f" - 计算: TP / (TP + FN) = {comparison['true_positives']} / ({comparison['true_positives']} + {comparison['false_negatives']})")
    print(f" - 含义: 在 {comparison['true_positives'] + comparison['false_negatives']} 个实际合适的项目中,正确识别出 {comparison['true_positives']} 个")
    print(f" - 随机水平: 50.00% (当前高于随机: {comparison['recall'] - 50.0:+.2f}%)")
    print()
    print(" 【其他指标】")
    print(f" 准确率: {comparison['accuracy']:.2f}% (整体判断正确率)")
    print(f" 精确率: {comparison['precision']:.2f}% (判断为合适的项目中,实际合适的比例)")
    print(f" F1分数: {comparison['f1_score']:.2f} (精确率和召回率的调和平均)")
    print(f" 匹配数: {comparison['matched']}/{comparison['total']}")
    print()
    print(" 【分类统计】")
    print(f" TP (正确识别为合适): {comparison['true_positives']}")
    print(f" TN (正确识别为不合适): {comparison['true_negatives']} ⭐")
    print(f" FP (误判为合适): {comparison['false_positives']} ⚠️")
    print(f" FN (误删合适项目): {comparison['false_negatives']} ⚠️")


def _print_false_positives(manual_results: List[Dict], llm_results: List[Dict]) -> None:
    """Print every item the operator rejected but the LLM accepted."""
    print("\n" + "=" * 60)
    print("人工评估不通过但LLM误判为通过的项目(FP - False Positive)")
    print("=" * 60)

    # Pair results by expression id.
    llm_dict = {r["expression_id"]: r for r in llm_results}

    fp_items = []
    for manual_result in manual_results:
        llm_result = llm_dict.get(manual_result["expression_id"])
        if llm_result is None:
            continue

        # Rejected manually but accepted by the LLM: a false positive.
        if not manual_result["suitable"] and llm_result["suitable"]:
            fp_items.append({
                "expression_id": manual_result["expression_id"],
                "situation": manual_result["situation"],
                "style": manual_result["style"],
                "manual_suitable": manual_result["suitable"],
                "llm_suitable": llm_result["suitable"],
                "llm_reason": llm_result.get("reason", "未提供理由"),
                "llm_error": llm_result.get("error"),
            })

    if not fp_items:
        print("\n✓ 没有误判项目(所有人工评估不通过的项目都被LLM正确识别为不通过)")
        return

    print(f"\n共找到 {len(fp_items)} 条误判项目:\n")
    for idx, item in enumerate(fp_items, 1):
        print(f"--- [{idx}] 项目 ID: {item['expression_id']} ---")
        print(f"Situation: {item['situation']}")
        print(f"Style: {item['style']}")
        print("人工评估: 不通过 ❌")
        print("LLM评估: 通过 ✅ (误判)")
        if item.get('llm_error'):
            print(f"LLM错误: {item['llm_error']}")
        print(f"LLM理由: {item['llm_reason']}")
        print()


async def _run_comparison() -> None:
    """Run the manual pass, the LLM pass, and report/save the comparison."""
    # 1. Draw the expressions to evaluate.
    logger.info("\n步骤1: 随机读取表达方式")
    expressions = get_random_expressions(10)
    if not expressions:
        logger.error("没有可用的表达方式,退出")
        return
    logger.info(f"成功读取 {len(expressions)} 条表达方式")

    # 2. Manual evaluation, one item at a time.
    print("\n" + "=" * 60)
    print("开始人工评估")
    print("=" * 60)
    print(f"共需要评估 {len(expressions)} 条表达方式")
    print("请逐条进行评估...\n")

    manual_results = []
    for i, expression in enumerate(expressions, 1):
        manual_result = manual_evaluate_expression(expression, i, len(expressions))
        if manual_result is None:
            print("\n评估已中断")
            return
        manual_results.append(manual_result)

    print("\n" + "=" * 60)
    print("人工评估完成")
    print("=" * 60)

    # 3. Build the LLM client.
    logger.info("\n步骤3: 创建LLM实例")
    try:
        llm = LLMRequest(
            model_set=model_config.model_task_config.tool_use,
            request_type="expression_evaluator_comparison"
        )
    except Exception as e:
        logger.error(f"创建LLM实例失败: {e}")
        import traceback

        logger.error(traceback.format_exc())
        return

    # 4. LLM evaluation, sequential with a short pause between requests.
    logger.info("\n步骤4: 开始LLM评估")
    llm_results = []
    for i, expression in enumerate(expressions, 1):
        logger.info(f"LLM评估进度: {i}/{len(expressions)}")
        llm_results.append(await evaluate_expression_llm(expression, llm))
        await asyncio.sleep(0.3)

    # 5. Compare and report.
    comparison = compare_evaluations(manual_results, llm_results, "LLM评估")
    _print_comparison_report(comparison)
    _print_false_positives(manual_results, llm_results)

    # 6. Persist everything as JSON; a failure here is only logged.
    output_file = os.path.join(project_root, "data", "expression_evaluation_comparison.json")
    try:
        os.makedirs(os.path.dirname(output_file), exist_ok=True)
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump({
                "manual_results": manual_results,
                "llm_results": llm_results,
                "comparison": comparison
            }, f, ensure_ascii=False, indent=2)
        logger.info(f"\n评估结果已保存到: {output_file}")
    except Exception as e:
        logger.warning(f"保存结果到文件失败: {e}")

    print("\n" + "=" * 60)
    print("评估完成")
    print("=" * 60)


async def main():
    """Entry point: compare manual and LLM evaluation of random expressions.

    Connects to the database, runs the comparison workflow, and closes the
    connection on every exit path (including the early exits taken when no
    expressions exist, the operator quits, or the LLM client cannot be built).
    """
    logger.info("=" * 60)
    logger.info("开始表达方式评估")
    logger.info("=" * 60)

    # Open the database connection; without it nothing else can run.
    try:
        db.connect(reuse_if_open=True)
        logger.info("数据库连接成功")
    except Exception as e:
        logger.error(f"数据库连接失败: {e}")
        return

    try:
        await _run_comparison()
    finally:
        # Close the database connection regardless of how the run ended.
        try:
            db.close()
            logger.info("数据库连接已关闭")
        except Exception as e:
            logger.warning(f"关闭数据库连接时出错: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(main())
|
||||||
|
|
||||||
|
|
@ -521,6 +521,7 @@ async def _react_agent_solve_question(
|
||||||
logger.warning(f"{react_log_prefix}第 {iteration + 1} 次迭代 无工具调用且无响应")
|
logger.warning(f"{react_log_prefix}第 {iteration + 1} 次迭代 无工具调用且无响应")
|
||||||
step["observations"] = ["无响应且无工具调用"]
|
step["observations"] = ["无响应且无工具调用"]
|
||||||
thinking_steps.append(step)
|
thinking_steps.append(step)
|
||||||
|
iteration += 1 # 在continue之前增加迭代计数,避免跳过iteration += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# 处理工具调用
|
# 处理工具调用
|
||||||
|
|
@ -1021,6 +1022,11 @@ async def _process_single_question(
|
||||||
Returns:
|
Returns:
|
||||||
Optional[str]: 如果找到答案,返回格式化的结果字符串,否则返回None
|
Optional[str]: 如果找到答案,返回格式化的结果字符串,否则返回None
|
||||||
"""
|
"""
|
||||||
|
# 如果question为空或None,直接返回None,不进行查询
|
||||||
|
if not question or not question.strip():
|
||||||
|
logger.debug("问题为空,跳过查询")
|
||||||
|
return None
|
||||||
|
|
||||||
# logger.info(f"开始处理问题: {question}")
|
# logger.info(f"开始处理问题: {question}")
|
||||||
|
|
||||||
_cleanup_stale_not_found_thinking_back()
|
_cleanup_stale_not_found_thinking_back()
|
||||||
|
|
@ -1116,15 +1122,14 @@ async def build_memory_retrieval_prompt(
|
||||||
recent_query_history = "最近没有查询记录。"
|
recent_query_history = "最近没有查询记录。"
|
||||||
|
|
||||||
# 第一步:生成问题或使用 Planner 提供的问题
|
# 第一步:生成问题或使用 Planner 提供的问题
|
||||||
questions = []
|
single_question: Optional[str] = None
|
||||||
|
|
||||||
# 如果 planner_question 配置开启,只使用 Planner 提供的问题,不使用旧模式
|
# 如果 planner_question 配置开启,只使用 Planner 提供的问题,不使用旧模式
|
||||||
if global_config.memory.planner_question:
|
if global_config.memory.planner_question:
|
||||||
if question and isinstance(question, str) and question.strip():
|
if question and isinstance(question, str) and question.strip():
|
||||||
# 清理和验证 question
|
# 清理和验证 question
|
||||||
cleaned_question = question.strip()
|
single_question = question.strip()
|
||||||
questions = [cleaned_question]
|
logger.info(f"{log_prefix}使用 Planner 提供的 question: {single_question}")
|
||||||
logger.info(f"{log_prefix}使用 Planner 提供的 question: {cleaned_question}")
|
|
||||||
else:
|
else:
|
||||||
# planner_question 开启但没有提供 question,跳过记忆检索
|
# planner_question 开启但没有提供 question,跳过记忆检索
|
||||||
logger.debug(f"{log_prefix}planner_question 已开启但未提供 question,跳过记忆检索")
|
logger.debug(f"{log_prefix}planner_question 已开启但未提供 question,跳过记忆检索")
|
||||||
|
|
@ -1157,10 +1162,11 @@ async def build_memory_retrieval_prompt(
|
||||||
logger.error(f"{log_prefix}LLM生成问题失败: {response}")
|
logger.error(f"{log_prefix}LLM生成问题失败: {response}")
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
# 解析概念列表和问题列表
|
# 解析概念列表和问题列表,只取第一个问题
|
||||||
_, questions = parse_questions_json(response)
|
_, questions = parse_questions_json(response)
|
||||||
if questions:
|
if questions and len(questions) > 0:
|
||||||
logger.info(f"{log_prefix}解析到 {len(questions)} 个问题: {questions}")
|
single_question = questions[0].strip()
|
||||||
|
logger.info(f"{log_prefix}解析到问题: {single_question}")
|
||||||
|
|
||||||
# 初始阶段:使用 Planner 提供的 unknown_words 进行检索(如果提供)
|
# 初始阶段:使用 Planner 提供的 unknown_words 进行检索(如果提供)
|
||||||
initial_info = ""
|
initial_info = ""
|
||||||
|
|
@ -1183,13 +1189,13 @@ async def build_memory_retrieval_prompt(
|
||||||
else:
|
else:
|
||||||
logger.debug(f"{log_prefix}unknown_words 检索未找到任何结果")
|
logger.debug(f"{log_prefix}unknown_words 检索未找到任何结果")
|
||||||
|
|
||||||
if not questions:
|
if not single_question:
|
||||||
logger.debug(f"{log_prefix}模型认为不需要检索记忆或解析失败,不返回任何查询结果")
|
logger.debug(f"{log_prefix}模型认为不需要检索记忆或解析失败,不返回任何查询结果")
|
||||||
end_time = time.time()
|
end_time = time.time()
|
||||||
logger.info(f"{log_prefix}无当次查询,不返回任何结果,耗时: {(end_time - start_time):.3f}秒")
|
logger.info(f"{log_prefix}无当次查询,不返回任何结果,耗时: {(end_time - start_time):.3f}秒")
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
# 第二步:并行处理所有问题(使用配置的最大迭代次数和超时时间)
|
# 第二步:处理问题(使用配置的最大迭代次数和超时时间)
|
||||||
base_max_iterations = global_config.memory.max_agent_iterations
|
base_max_iterations = global_config.memory.max_agent_iterations
|
||||||
# 根据think_level调整迭代次数:think_level=1时不变,think_level=0时减半
|
# 根据think_level调整迭代次数:think_level=1时不变,think_level=0时减半
|
||||||
if think_level == 0:
|
if think_level == 0:
|
||||||
|
|
@ -1198,31 +1204,21 @@ async def build_memory_retrieval_prompt(
|
||||||
max_iterations = base_max_iterations
|
max_iterations = base_max_iterations
|
||||||
timeout_seconds = global_config.memory.agent_timeout_seconds
|
timeout_seconds = global_config.memory.agent_timeout_seconds
|
||||||
logger.debug(
|
logger.debug(
|
||||||
f"{log_prefix}问题数量: {len(questions)},think_level={think_level},设置最大迭代次数: {max_iterations}(基础值: {base_max_iterations}),超时时间: {timeout_seconds}秒"
|
f"{log_prefix}问题: {single_question},think_level={think_level},设置最大迭代次数: {max_iterations}(基础值: {base_max_iterations}),超时时间: {timeout_seconds}秒"
|
||||||
)
|
)
|
||||||
|
|
||||||
# 并行处理所有问题
|
# 处理单个问题
|
||||||
question_tasks = [
|
try:
|
||||||
_process_single_question(
|
result = await _process_single_question(
|
||||||
question=question,
|
question=single_question,
|
||||||
chat_id=chat_id,
|
chat_id=chat_id,
|
||||||
context=message,
|
context=message,
|
||||||
initial_info=initial_info,
|
initial_info=initial_info,
|
||||||
max_iterations=max_iterations,
|
max_iterations=max_iterations,
|
||||||
)
|
)
|
||||||
for question in questions
|
except Exception as e:
|
||||||
]
|
logger.error(f"{log_prefix}处理问题 '{single_question}' 时发生异常: {e}")
|
||||||
|
result = None
|
||||||
# 并行执行所有查询任务
|
|
||||||
results = await asyncio.gather(*question_tasks, return_exceptions=True)
|
|
||||||
|
|
||||||
# 收集所有有效结果
|
|
||||||
question_results: List[str] = []
|
|
||||||
for i, result in enumerate(results):
|
|
||||||
if isinstance(result, Exception):
|
|
||||||
logger.error(f"{log_prefix}处理问题 '{questions[i]}' 时发生异常: {result}")
|
|
||||||
elif result is not None:
|
|
||||||
question_results.append(result)
|
|
||||||
|
|
||||||
# 获取最近10分钟内已找到答案的缓存记录
|
# 获取最近10分钟内已找到答案的缓存记录
|
||||||
cached_answers = _get_recent_found_answers(chat_id, time_window_seconds=600.0)
|
cached_answers = _get_recent_found_answers(chat_id, time_window_seconds=600.0)
|
||||||
|
|
@ -1231,29 +1227,29 @@ async def build_memory_retrieval_prompt(
|
||||||
all_results = []
|
all_results = []
|
||||||
|
|
||||||
# 先添加当前查询的结果
|
# 先添加当前查询的结果
|
||||||
current_questions = set()
|
current_question = None
|
||||||
for result in question_results:
|
if result:
|
||||||
|
all_results.append(result)
|
||||||
# 提取问题(格式为 "问题:xxx\n答案:xxx")
|
# 提取问题(格式为 "问题:xxx\n答案:xxx")
|
||||||
if result.startswith("问题:"):
|
if result.startswith("问题:"):
|
||||||
question_end = result.find("\n答案:")
|
question_end = result.find("\n答案:")
|
||||||
if question_end != -1:
|
if question_end != -1:
|
||||||
current_questions.add(result[4:question_end])
|
current_question = result[4:question_end]
|
||||||
all_results.append(result)
|
|
||||||
|
|
||||||
# 添加缓存答案(排除当前查询中已存在的问题)
|
# 添加缓存答案(排除当前查询的问题)
|
||||||
for cached_answer in cached_answers:
|
for cached_answer in cached_answers:
|
||||||
if cached_answer.startswith("问题:"):
|
if cached_answer.startswith("问题:"):
|
||||||
question_end = cached_answer.find("\n答案:")
|
question_end = cached_answer.find("\n答案:")
|
||||||
if question_end != -1:
|
if question_end != -1:
|
||||||
cached_question = cached_answer[4:question_end]
|
cached_question = cached_answer[4:question_end]
|
||||||
if cached_question not in current_questions:
|
if cached_question != current_question:
|
||||||
all_results.append(cached_answer)
|
all_results.append(cached_answer)
|
||||||
|
|
||||||
end_time = time.time()
|
end_time = time.time()
|
||||||
|
|
||||||
if all_results:
|
if all_results:
|
||||||
retrieved_memory = "\n\n".join(all_results)
|
retrieved_memory = "\n\n".join(all_results)
|
||||||
current_count = len(question_results)
|
current_count = 1 if result else 0
|
||||||
cached_count = len(all_results) - current_count
|
cached_count = len(all_results) - current_count
|
||||||
logger.info(
|
logger.info(
|
||||||
f"{log_prefix}记忆检索成功,耗时: {(end_time - start_time):.3f}秒,"
|
f"{log_prefix}记忆检索成功,耗时: {(end_time - start_time):.3f}秒,"
|
||||||
|
|
@ -1261,7 +1257,7 @@ async def build_memory_retrieval_prompt(
|
||||||
)
|
)
|
||||||
return f"你回忆起了以下信息:\n{retrieved_memory}\n如果与回复内容相关,可以参考这些回忆的信息。\n"
|
return f"你回忆起了以下信息:\n{retrieved_memory}\n如果与回复内容相关,可以参考这些回忆的信息。\n"
|
||||||
else:
|
else:
|
||||||
logger.debug(f"{log_prefix}所有问题均未找到答案,且无缓存答案")
|
logger.debug(f"{log_prefix}问题未找到答案,且无缓存答案")
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
|
||||||
|
|
@ -141,13 +141,13 @@ temperature = 0.2 # 模型温度,新V3建议0.1-0.3
|
||||||
max_tokens = 4096 # 最大输出token数
|
max_tokens = 4096 # 最大输出token数
|
||||||
slow_threshold = 15.0 # 慢请求阈值(秒),模型等待回复时间超过此值会输出警告日志
|
slow_threshold = 15.0 # 慢请求阈值(秒),模型等待回复时间超过此值会输出警告日志
|
||||||
|
|
||||||
[model_task_config.tool_use] #工具调用模型,需要使用支持工具调用的模型
|
[model_task_config.tool_use] #功能模型,需要使用支持工具调用的模型,请使用较快的小模型(调用量较大)
|
||||||
model_list = ["qwen3-30b","qwen3-next-80b"]
|
model_list = ["qwen3-30b","qwen3-next-80b"]
|
||||||
temperature = 0.7
|
temperature = 0.7
|
||||||
max_tokens = 800
|
max_tokens = 1024
|
||||||
slow_threshold = 10.0
|
slow_threshold = 10.0
|
||||||
|
|
||||||
[model_task_config.replyer] # 首要回复模型,还用于表达器和表达方式学习
|
[model_task_config.replyer] # 首要回复模型,还用于表达方式学习
|
||||||
model_list = ["siliconflow-deepseek-v3.2","siliconflow-deepseek-v3.2-think","siliconflow-glm-4.6","siliconflow-glm-4.6-think"]
|
model_list = ["siliconflow-deepseek-v3.2","siliconflow-deepseek-v3.2-think","siliconflow-glm-4.6","siliconflow-glm-4.6-think"]
|
||||||
temperature = 0.3 # 模型温度,新V3建议0.1-0.3
|
temperature = 0.3 # 模型温度,新V3建议0.1-0.3
|
||||||
max_tokens = 2048
|
max_tokens = 2048
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue