diff --git a/scripts/evaluate_expressions.py b/scripts/evaluate_expressions.py deleted file mode 100644 index 5f413423..00000000 --- a/scripts/evaluate_expressions.py +++ /dev/null @@ -1,295 +0,0 @@ -""" -表达方式评估脚本 - -功能: -1. 随机读取10条表达方式,获取其situation和style -2. 使用LLM对表达方式进行评估(每个表达方式单独评估) -3. 如果合适,就通过,如果不合适,就丢弃 -4. 不真正修改数据库,只是做评估 -""" - -import asyncio -import random -import json -import sys -import os - -# 添加项目根目录到路径 -project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) -sys.path.insert(0, project_root) - -from src.common.database.database_model import Expression -from src.common.database.database import db -from src.llm_models.utils_model import LLMRequest -from src.config.config import model_config -from src.common.logger import get_logger - -logger = get_logger("expression_evaluator") - - -def get_random_expressions(count: int = 10) -> list[Expression]: - """ - 随机读取指定数量的表达方式 - - Args: - count: 要读取的数量,默认10条 - - Returns: - 表达方式列表 - """ - try: - # 查询所有表达方式 - all_expressions = list(Expression.select()) - - if not all_expressions: - logger.warning("数据库中没有表达方式记录") - return [] - - # 如果总数少于请求数量,返回所有 - if len(all_expressions) <= count: - logger.info(f"数据库中共有 {len(all_expressions)} 条表达方式,全部返回") - return all_expressions - - # 随机选择指定数量 - selected = random.sample(all_expressions, count) - logger.info(f"从 {len(all_expressions)} 条表达方式中随机选择了 {len(selected)} 条") - return selected - - except Exception as e: - logger.error(f"随机读取表达方式失败: {e}") - import traceback - logger.error(traceback.format_exc()) - return [] - - -def create_evaluation_prompt(situation: str, style: str) -> str: - """ - 创建评估提示词 - - Args: - situation: 情境 - style: 风格 - - Returns: - 评估提示词 - """ - prompt = f"""请评估以下表达方式是否合适: - -情境(situation):{situation} -风格(style):{style} - -请从以下方面进行评估: -1. 情境描述是否清晰、准确 -2. 风格表达是否合理、自然 -3. 情境和风格是否匹配 -4. 
是否存在不当内容或表达 - -请以JSON格式输出评估结果: -{{ - "suitable": true/false, - "reason": "评估理由(如果不合适,请说明原因)" -}} - -如果合适,suitable设为true;如果不合适,suitable设为false,并在reason中说明原因。 -请严格按照JSON格式输出,不要包含其他内容。""" - - return prompt - - -async def evaluate_expression(expression: Expression, llm: LLMRequest) -> dict: - """ - 使用LLM评估单个表达方式 - - Args: - expression: 表达方式对象 - llm: LLM请求实例 - - Returns: - 评估结果字典,包含: - - expression_id: 表达方式ID - - situation: 情境 - - style: 风格 - - suitable: 是否合适 - - reason: 评估理由 - - error: 错误信息(如果有) - """ - result = { - "expression_id": expression.id, - "situation": expression.situation, - "style": expression.style, - "suitable": None, - "reason": None, - "error": None - } - - try: - # 创建评估提示词 - prompt = create_evaluation_prompt(expression.situation, expression.style) - - # 调用LLM进行评估 - logger.info(f"正在评估表达方式 ID: {expression.id}, Situation: {expression.situation}, Style: {expression.style}") - response, (reasoning, model_name, _) = await llm.generate_response_async( - prompt=prompt, - temperature=0.3, - max_tokens=500 - ) - - logger.debug(f"LLM响应: {response}") - logger.debug(f"使用模型: {model_name}") - - # 解析JSON响应 - try: - # 尝试直接解析 - evaluation = json.loads(response) - except json.JSONDecodeError: - # 如果直接解析失败,尝试提取JSON部分 - import re - json_match = re.search(r'\{[^{}]*"suitable"[^{}]*\}', response, re.DOTALL) - if json_match: - evaluation = json.loads(json_match.group()) - else: - raise ValueError("无法从响应中提取JSON格式的评估结果") - - # 提取评估结果 - result["suitable"] = evaluation.get("suitable", False) - result["reason"] = evaluation.get("reason", "未提供理由") - - logger.info(f"表达方式 ID: {expression.id} 评估结果: {'通过' if result['suitable'] else '不通过'}") - if result["reason"]: - logger.info(f"评估理由: {result['reason']}") - - except Exception as e: - logger.error(f"评估表达方式 ID: {expression.id} 时出错: {e}") - import traceback - logger.error(traceback.format_exc()) - result["error"] = str(e) - result["suitable"] = False - result["reason"] = f"评估过程出错: {str(e)}" - - return result - - -async def main(): - """主函数""" - logger.info("=" * 60) - logger.info("开始表达方式评估") - logger.info("=" * 60) - - # 初始化数据库连接 - try: - db.connect(reuse_if_open=True) - logger.info("数据库连接成功") - except Exception as e: - logger.error(f"数据库连接失败: {e}") - return - - # 1. 随机读取10条表达方式 - logger.info("\n步骤1: 随机读取10条表达方式") - expressions = get_random_expressions(10) - - if not expressions: - logger.error("没有可用的表达方式,退出") - return - - logger.info(f"成功读取 {len(expressions)} 条表达方式") - for i, expr in enumerate(expressions, 1): - logger.info(f" {i}. ID: {expr.id}, Situation: {expr.situation}, Style: {expr.style}") - - # 2. 创建LLM实例 - logger.info("\n步骤2: 创建LLM实例") - try: - llm = LLMRequest( - model_set=model_config.model_task_config.tool_use, - request_type="expression_evaluator" - ) - logger.info("LLM实例创建成功") - except Exception as e: - logger.error(f"创建LLM实例失败: {e}") - import traceback - logger.error(traceback.format_exc()) - return - - # 3. 对每个表达方式进行评估 - logger.info("\n步骤3: 开始评估表达方式") - results = [] - - for i, expression in enumerate(expressions, 1): - logger.info(f"\n--- 评估进度: {i}/{len(expressions)} ---") - result = await evaluate_expression(expression, llm) - results.append(result) - - # 添加短暂延迟,避免请求过快 - if i < len(expressions): - await asyncio.sleep(0.5) - - # 4. 
汇总结果 - logger.info("\n" + "=" * 60) - logger.info("评估结果汇总") - logger.info("=" * 60) - - passed = [r for r in results if r["suitable"] is True] - failed = [r for r in results if r["suitable"] is False] - errors = [r for r in results if r["error"] is not None] - - logger.info(f"\n总计: {len(results)} 条") - logger.info(f"通过: {len(passed)} 条") - logger.info(f"不通过: {len(failed)} 条") - if errors: - logger.info(f"出错: {len(errors)} 条") - - # 详细结果 - logger.info("\n--- 通过的表达方式 ---") - if passed: - for r in passed: - logger.info(f" ID: {r['expression_id']}") - logger.info(f" Situation: {r['situation']}") - logger.info(f" Style: {r['style']}") - if r['reason']: - logger.info(f" 理由: {r['reason']}") - else: - logger.info(" 无") - - logger.info("\n--- 不通过的表达方式 ---") - if failed: - for r in failed: - logger.info(f" ID: {r['expression_id']}") - logger.info(f" Situation: {r['situation']}") - logger.info(f" Style: {r['style']}") - if r['reason']: - logger.info(f" 理由: {r['reason']}") - if r['error']: - logger.info(f" 错误: {r['error']}") - else: - logger.info(" 无") - - # 保存结果到JSON文件(可选) - output_file = os.path.join(project_root, "data", "expression_evaluation_results.json") - try: - os.makedirs(os.path.dirname(output_file), exist_ok=True) - with open(output_file, "w", encoding="utf-8") as f: - json.dump({ - "total": len(results), - "passed": len(passed), - "failed": len(failed), - "errors": len(errors), - "results": results - }, f, ensure_ascii=False, indent=2) - logger.info(f"\n评估结果已保存到: {output_file}") - except Exception as e: - logger.warning(f"保存结果到文件失败: {e}") - - logger.info("\n" + "=" * 60) - logger.info("评估完成") - logger.info("=" * 60) - - # 关闭数据库连接 - try: - db.close() - logger.info("数据库连接已关闭") - except Exception as e: - logger.warning(f"关闭数据库连接时出错: {e}") - - -if __name__ == "__main__": - asyncio.run(main()) - diff --git a/scripts/evaluate_expressions_llm_v6.py b/scripts/evaluate_expressions_llm_v6.py new file mode 100644 index 00000000..0a696e48 --- /dev/null +++ b/scripts/evaluate_expressions_llm_v6.py @@ -0,0 +1,488 @@ +""" +表达方式LLM评估脚本 + +功能: +1. 读取已保存的人工评估结果(作为效标) +2. 使用LLM对相同项目进行评估 +3. 
对比人工评估和LLM评估的结果,输出分析报告 +""" + +import asyncio +import argparse +import json +import random +import sys +import os +from typing import List, Dict + +# 添加项目根目录到路径 +project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) +sys.path.insert(0, project_root) + +from src.llm_models.utils_model import LLMRequest +from src.config.config import model_config +from src.common.logger import get_logger + +logger = get_logger("expression_evaluator_llm") + +# 评估结果文件路径 +TEMP_DIR = os.path.join(os.path.dirname(__file__), "temp") +MANUAL_EVAL_FILE = os.path.join(TEMP_DIR, "manual_evaluation_results.json") + + +def load_manual_results() -> List[Dict]: + """ + 加载人工评估结果 + + Returns: + 人工评估结果列表 + """ + if not os.path.exists(MANUAL_EVAL_FILE): + logger.error(f"未找到人工评估结果文件: {MANUAL_EVAL_FILE}") + print("\n✗ 错误:未找到人工评估结果文件") + print(" 请先运行 evaluate_expressions_manual.py 进行人工评估") + return [] + + try: + with open(MANUAL_EVAL_FILE, "r", encoding="utf-8") as f: + data = json.load(f) + results = data.get("manual_results", []) + logger.info(f"成功加载 {len(results)} 条人工评估结果") + return results + except Exception as e: + logger.error(f"加载人工评估结果失败: {e}") + print(f"\n✗ 加载人工评估结果失败: {e}") + return [] + + +def create_evaluation_prompt(situation: str, style: str) -> str: + """ + 创建评估提示词 + + Args: + situation: 情境 + style: 风格 + + Returns: + 评估提示词 + """ + prompt = f"""请评估以下表达方式或语言风格以及使用条件或使用情景是否合适: +使用条件或使用情景:{situation} +表达方式或言语风格:{style} + +请从以下方面进行评估: +1. 表达方式或言语风格 是否与使用条件或使用情景 匹配 +2. 允许部分语法错误或口头化或缺省出现 +3. 表达方式不能太过特指,需要具有泛用性 +4. 一般不涉及具体的人名或名称 + +请以JSON格式输出评估结果: +{{ + "suitable": true/false, + "reason": "评估理由(如果不合适,请说明原因)" + +}} +如果合适,suitable设为true;如果不合适,suitable设为false,并在reason中说明原因。 +请严格按照JSON格式输出,不要包含其他内容。""" + + return prompt + + +async def _single_llm_evaluation(situation: str, style: str, llm: LLMRequest) -> tuple[bool, str, str | None]: + """ + 执行单次LLM评估 + + Args: + situation: 情境 + style: 风格 + llm: LLM请求实例 + + Returns: + (suitable, reason, error) 元组,如果出错则 suitable 为 False,error 包含错误信息 + """ + try: + prompt = create_evaluation_prompt(situation, style) + logger.debug(f"正在评估表达方式: situation={situation}, style={style}") + + response, (reasoning, model_name, _) = await llm.generate_response_async( + prompt=prompt, + temperature=0.6, + max_tokens=1024 + ) + + logger.debug(f"LLM响应: {response}") + + # 解析JSON响应 + try: + evaluation = json.loads(response) + except json.JSONDecodeError as e: + import re + json_match = re.search(r'\{[^{}]*"suitable"[^{}]*\}', response, re.DOTALL) + if json_match: + evaluation = json.loads(json_match.group()) + else: + raise ValueError("无法从响应中提取JSON格式的评估结果") from e + + suitable = evaluation.get("suitable", False) + reason = evaluation.get("reason", "未提供理由") + + logger.debug(f"评估结果: {'通过' if suitable else '不通过'}") + return suitable, reason, None + + except Exception as e: + logger.error(f"评估表达方式 (situation={situation}, style={style}) 时出错: {e}") + return False, f"评估过程出错: {str(e)}", str(e) + + +async def evaluate_expression_llm(situation: str, style: str, llm: LLMRequest) -> Dict: + """ + 使用LLM评估单个表达方式 + + Args: + situation: 情境 + style: 风格 + llm: LLM请求实例 + + Returns: + 评估结果字典 + """ + logger.info(f"开始评估表达方式: situation={situation}, style={style}") + + suitable, reason, error = await _single_llm_evaluation(situation, style, llm) + + if error: + suitable = False + + logger.info(f"评估完成: {'通过' if suitable else '不通过'}") + + return { + "situation": situation, + "style": style, + "suitable": suitable, + "reason": reason, + "error": error, + "evaluator": "llm" + } + + +def 
compare_evaluations(manual_results: List[Dict], llm_results: List[Dict], method_name: str) -> Dict: + """ + 对比人工评估和LLM评估的结果 + + Args: + manual_results: 人工评估结果列表 + llm_results: LLM评估结果列表 + method_name: 评估方法名称(用于标识) + + Returns: + 对比分析结果字典 + """ + # 按(situation, style)建立映射 + llm_dict = {(r["situation"], r["style"]): r for r in llm_results} + + total = len(manual_results) + matched = 0 + true_positives = 0 + true_negatives = 0 + false_positives = 0 + false_negatives = 0 + + for manual_result in manual_results: + pair = (manual_result["situation"], manual_result["style"]) + llm_result = llm_dict.get(pair) + if llm_result is None: + continue + + manual_suitable = manual_result["suitable"] + llm_suitable = llm_result["suitable"] + + if manual_suitable == llm_suitable: + matched += 1 + + if manual_suitable and llm_suitable: + true_positives += 1 + elif not manual_suitable and not llm_suitable: + true_negatives += 1 + elif not manual_suitable and llm_suitable: + false_positives += 1 + elif manual_suitable and not llm_suitable: + false_negatives += 1 + + accuracy = (matched / total * 100) if total > 0 else 0 + precision = (true_positives / (true_positives + false_positives) * 100) if (true_positives + false_positives) > 0 else 0 + recall = (true_positives / (true_positives + false_negatives) * 100) if (true_positives + false_negatives) > 0 else 0 + f1_score = (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0 + specificity = (true_negatives / (true_negatives + false_positives) * 100) if (true_negatives + false_positives) > 0 else 0 + + # 计算人工效标的不合适率 + manual_unsuitable_count = true_negatives + false_positives # 人工评估不合适的总数 + manual_unsuitable_rate = (manual_unsuitable_count / total * 100) if total > 0 else 0 + + # 计算经过LLM删除后剩余项目中的不合适率 + # 在所有项目中,移除LLM判定为不合适的项目后,剩下的项目 = TP + FP(LLM判定为合适的项目) + # 在这些剩下的项目中,按人工评定的不合适项目 = FP(人工认为不合适,但LLM认为合适) + llm_kept_count = true_positives + false_positives # LLM判定为合适的项目总数(保留的项目) + llm_kept_unsuitable_rate = (false_positives / llm_kept_count * 100) if llm_kept_count > 0 else 0 + + # 两者百分比相减(评估LLM评定修正后的不合适率是否有降低) + rate_difference = manual_unsuitable_rate - llm_kept_unsuitable_rate + + random_baseline = 50.0 + accuracy_above_random = accuracy - random_baseline + accuracy_improvement_ratio = (accuracy / random_baseline) if random_baseline > 0 else 0 + + return { + "method": method_name, + "total": total, + "matched": matched, + "accuracy": accuracy, + "accuracy_above_random": accuracy_above_random, + "accuracy_improvement_ratio": accuracy_improvement_ratio, + "true_positives": true_positives, + "true_negatives": true_negatives, + "false_positives": false_positives, + "false_negatives": false_negatives, + "precision": precision, + "recall": recall, + "f1_score": f1_score, + "specificity": specificity, + "manual_unsuitable_rate": manual_unsuitable_rate, + "llm_kept_unsuitable_rate": llm_kept_unsuitable_rate, + "rate_difference": rate_difference + } + + +async def main(count: int | None = None): + """ + 主函数 + + Args: + count: 随机选取的数据条数,如果为None则使用全部数据 + """ + logger.info("=" * 60) + logger.info("开始表达方式LLM评估") + logger.info("=" * 60) + + # 1. 
加载人工评估结果 + print("\n步骤1: 加载人工评估结果") + manual_results = load_manual_results() + if not manual_results: + return + + print(f"成功加载 {len(manual_results)} 条人工评估结果") + + # 如果指定了数量,随机选择指定数量的数据 + if count is not None: + if count <= 0: + print(f"\n✗ 错误:指定的数量必须大于0,当前值: {count}") + return + if count > len(manual_results): + print(f"\n⚠ 警告:指定的数量 ({count}) 大于可用数据量 ({len(manual_results)}),将使用全部数据") + else: + random.seed() # 使用系统时间作为随机种子 + manual_results = random.sample(manual_results, count) + print(f"随机选取 {len(manual_results)} 条数据进行评估") + + # 验证数据完整性 + valid_manual_results = [] + for r in manual_results: + if "situation" in r and "style" in r: + valid_manual_results.append(r) + else: + logger.warning(f"跳过无效数据: {r}") + + if len(valid_manual_results) != len(manual_results): + print(f"警告:{len(manual_results) - len(valid_manual_results)} 条数据缺少必要字段,已跳过") + + print(f"有效数据: {len(valid_manual_results)} 条") + + # 2. 创建LLM实例并评估 + print("\n步骤2: 创建LLM实例") + try: + llm = LLMRequest( + model_set=model_config.model_task_config.tool_use, + request_type="expression_evaluator_llm" + ) + except Exception as e: + logger.error(f"创建LLM实例失败: {e}") + import traceback + logger.error(traceback.format_exc()) + return + + print("\n步骤3: 开始LLM评估") + llm_results = [] + for i, manual_result in enumerate(valid_manual_results, 1): + print(f"LLM评估进度: {i}/{len(valid_manual_results)}") + llm_results.append(await evaluate_expression_llm( + manual_result["situation"], + manual_result["style"], + llm + )) + await asyncio.sleep(0.3) + + # 5. 输出FP和FN项目(在评估结果之前) + llm_dict = {(r["situation"], r["style"]): r for r in llm_results} + + # 5.1 输出FP项目(人工评估不通过但LLM误判为通过) + print("\n" + "=" * 60) + print("人工评估不通过但LLM误判为通过的项目(FP - False Positive)") + print("=" * 60) + + fp_items = [] + for manual_result in valid_manual_results: + pair = (manual_result["situation"], manual_result["style"]) + llm_result = llm_dict.get(pair) + if llm_result is None: + continue + + # 人工评估不通过,但LLM评估通过(FP情况) + if not manual_result["suitable"] and llm_result["suitable"]: + fp_items.append({ + "situation": manual_result["situation"], + "style": manual_result["style"], + "manual_suitable": manual_result["suitable"], + "llm_suitable": llm_result["suitable"], + "llm_reason": llm_result.get("reason", "未提供理由"), + "llm_error": llm_result.get("error") + }) + + if fp_items: + print(f"\n共找到 {len(fp_items)} 条误判项目:\n") + for idx, item in enumerate(fp_items, 1): + print(f"--- [{idx}] ---") + print(f"Situation: {item['situation']}") + print(f"Style: {item['style']}") + print("人工评估: 不通过 ❌") + print("LLM评估: 通过 ✅ (误判)") + if item.get('llm_error'): + print(f"LLM错误: {item['llm_error']}") + print(f"LLM理由: {item['llm_reason']}") + print() + else: + print("\n✓ 没有误判项目(所有人工评估不通过的项目都被LLM正确识别为不通过)") + + # 5.2 输出FN项目(人工评估通过但LLM误判为不通过) + print("\n" + "=" * 60) + print("人工评估通过但LLM误判为不通过的项目(FN - False Negative)") + print("=" * 60) + + fn_items = [] + for manual_result in valid_manual_results: + pair = (manual_result["situation"], manual_result["style"]) + llm_result = llm_dict.get(pair) + if llm_result is None: + continue + + # 人工评估通过,但LLM评估不通过(FN情况) + if manual_result["suitable"] and not llm_result["suitable"]: + fn_items.append({ + "situation": manual_result["situation"], + "style": manual_result["style"], + "manual_suitable": manual_result["suitable"], + "llm_suitable": llm_result["suitable"], + "llm_reason": llm_result.get("reason", "未提供理由"), + "llm_error": llm_result.get("error") + }) + + if fn_items: + print(f"\n共找到 {len(fn_items)} 条误删项目:\n") + for idx, item in enumerate(fn_items, 1): + print(f"--- 
[{idx}] ---") + print(f"Situation: {item['situation']}") + print(f"Style: {item['style']}") + print("人工评估: 通过 ✅") + print("LLM评估: 不通过 ❌ (误删)") + if item.get('llm_error'): + print(f"LLM错误: {item['llm_error']}") + print(f"LLM理由: {item['llm_reason']}") + print() + else: + print("\n✓ 没有误删项目(所有人工评估通过的项目都被LLM正确识别为通过)") + + # 6. 对比分析并输出结果 + comparison = compare_evaluations(valid_manual_results, llm_results, "LLM评估") + + print("\n" + "=" * 60) + print("评估结果(以人工评估为标准)") + print("=" * 60) + + # 详细评估结果(核心指标优先) + print(f"\n--- {comparison['method']} ---") + print(f" 总数: {comparison['total']} 条") + print() + # print(" 【核心能力指标】") + print(f" 特定负类召回率: {comparison['specificity']:.2f}% (将不合适项目正确提取出来的能力)") + print(f" - 计算: TN / (TN + FP) = {comparison['true_negatives']} / ({comparison['true_negatives']} + {comparison['false_positives']})") + print(f" - 含义: 在 {comparison['true_negatives'] + comparison['false_positives']} 个实际不合适的项目中,正确识别出 {comparison['true_negatives']} 个") + # print(f" - 随机水平: 50.00% (当前高于随机: {comparison['specificity'] - 50.0:+.2f}%)") + print() + print(f" 召回率: {comparison['recall']:.2f}% (尽可能少的误删合适项目的能力)") + print(f" - 计算: TP / (TP + FN) = {comparison['true_positives']} / ({comparison['true_positives']} + {comparison['false_negatives']})") + print(f" - 含义: 在 {comparison['true_positives'] + comparison['false_negatives']} 个实际合适的项目中,正确识别出 {comparison['true_positives']} 个") + # print(f" - 随机水平: 50.00% (当前高于随机: {comparison['recall'] - 50.0:+.2f}%)") + print() + print(" 【其他指标】") + print(f" 准确率: {comparison['accuracy']:.2f}% (整体判断正确率)") + print(f" 精确率: {comparison['precision']:.2f}% (判断为合适的项目中,实际合适的比例)") + print(f" F1分数: {comparison['f1_score']:.2f} (精确率和召回率的调和平均)") + print(f" 匹配数: {comparison['matched']}/{comparison['total']}") + print() + print(" 【不合适率分析】") + print(f" 人工效标的不合适率: {comparison['manual_unsuitable_rate']:.2f}%") + print(f" - 计算: (TN + FP) / 总数 = ({comparison['true_negatives']} + {comparison['false_positives']}) / {comparison['total']}") + print(f" - 含义: 在人工评估中,有 {comparison['manual_unsuitable_rate']:.2f}% 的项目被判定为不合适") + print() + print(f" 经过LLM删除后剩余项目中的不合适率: {comparison['llm_kept_unsuitable_rate']:.2f}%") + print(f" - 计算: FP / (TP + FP) = {comparison['false_positives']} / ({comparison['true_positives']} + {comparison['false_positives']})") + print(f" - 含义: 在所有项目中,移除LLM判定为不合适的项目后,在剩下的 {comparison['true_positives'] + comparison['false_positives']} 个项目中,人工认为不合适的项目占 {comparison['llm_kept_unsuitable_rate']:.2f}%") + print() + # print(f" 两者百分比差值: {comparison['rate_difference']:+.2f}%") + # print(f" - 计算: 人工效标不合适率 - LLM删除后剩余项目不合适率 = {comparison['manual_unsuitable_rate']:.2f}% - {comparison['llm_kept_unsuitable_rate']:.2f}%") + # print(f" - 含义: {'LLM删除后剩余项目中的不合适率降低了' if comparison['rate_difference'] > 0 else 'LLM删除后剩余项目中的不合适率反而升高了' if comparison['rate_difference'] < 0 else '两者相等'} ({'✓ LLM删除有效' if comparison['rate_difference'] > 0 else '✗ LLM删除效果不佳' if comparison['rate_difference'] < 0 else '效果相同'})") + # print() + print(" 【分类统计】") + print(f" TP (正确识别为合适): {comparison['true_positives']}") + print(f" TN (正确识别为不合适): {comparison['true_negatives']} ⭐") + print(f" FP (误判为合适): {comparison['false_positives']} ⚠️") + print(f" FN (误删合适项目): {comparison['false_negatives']} ⚠️") + + # 7. 
保存结果到JSON文件 + output_file = os.path.join(project_root, "data", "expression_evaluation_llm.json") + try: + os.makedirs(os.path.dirname(output_file), exist_ok=True) + with open(output_file, "w", encoding="utf-8") as f: + json.dump({ + "manual_results": valid_manual_results, + "llm_results": llm_results, + "comparison": comparison + }, f, ensure_ascii=False, indent=2) + logger.info(f"\n评估结果已保存到: {output_file}") + except Exception as e: + logger.warning(f"保存结果到文件失败: {e}") + + print("\n" + "=" * 60) + print("评估完成") + print("=" * 60) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="表达方式LLM评估脚本", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +示例: + python evaluate_expressions_llm_v6.py # 使用全部数据 + python evaluate_expressions_llm_v6.py -n 50 # 随机选取50条数据 + python evaluate_expressions_llm_v6.py --count 100 # 随机选取100条数据 + """ + ) + parser.add_argument( + "-n", "--count", + type=int, + default=None, + help="随机选取的数据条数(默认:使用全部数据)" + ) + + args = parser.parse_args() + asyncio.run(main(count=args.count)) + diff --git a/scripts/evaluate_expressions_manual.py b/scripts/evaluate_expressions_manual.py new file mode 100644 index 00000000..8221112b --- /dev/null +++ b/scripts/evaluate_expressions_manual.py @@ -0,0 +1,278 @@ +""" +表达方式人工评估脚本 + +功能: +1. 不停随机抽取项目(不重复)进行人工评估 +2. 将结果保存到 temp 文件夹下的 JSON 文件,作为效标(标准答案) +3. 支持继续评估(从已有文件中读取已评估的项目,避免重复) +""" + +import random +import json +import sys +import os +from typing import List, Dict, Set, Tuple +from datetime import datetime + +# 添加项目根目录到路径 +project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) +sys.path.insert(0, project_root) + +from src.common.database.database_model import Expression +from src.common.database.database import db +from src.common.logger import get_logger + +logger = get_logger("expression_evaluator_manual") + +# 评估结果文件路径 +TEMP_DIR = os.path.join(os.path.dirname(__file__), "temp") +MANUAL_EVAL_FILE = os.path.join(TEMP_DIR, "manual_evaluation_results.json") + + +def load_existing_results() -> tuple[List[Dict], Set[Tuple[str, str]]]: + """ + 加载已有的评估结果 + + Returns: + (已有结果列表, 已评估的项目(situation, style)元组集合) + """ + if not os.path.exists(MANUAL_EVAL_FILE): + return [], set() + + try: + with open(MANUAL_EVAL_FILE, "r", encoding="utf-8") as f: + data = json.load(f) + results = data.get("manual_results", []) + # 使用 (situation, style) 作为唯一标识 + evaluated_pairs = {(r["situation"], r["style"]) for r in results if "situation" in r and "style" in r} + logger.info(f"已加载 {len(results)} 条已有评估结果") + return results, evaluated_pairs + except Exception as e: + logger.error(f"加载已有评估结果失败: {e}") + return [], set() + + +def save_results(manual_results: List[Dict]): + """ + 保存评估结果到文件 + + Args: + manual_results: 评估结果列表 + """ + try: + os.makedirs(TEMP_DIR, exist_ok=True) + + data = { + "last_updated": datetime.now().isoformat(), + "total_count": len(manual_results), + "manual_results": manual_results + } + + with open(MANUAL_EVAL_FILE, "w", encoding="utf-8") as f: + json.dump(data, f, ensure_ascii=False, indent=2) + + logger.info(f"评估结果已保存到: {MANUAL_EVAL_FILE}") + print(f"\n✓ 评估结果已保存(共 {len(manual_results)} 条)") + except Exception as e: + logger.error(f"保存评估结果失败: {e}") + print(f"\n✗ 保存评估结果失败: {e}") + + +def get_unevaluated_expressions(evaluated_pairs: Set[Tuple[str, str]], batch_size: int = 10) -> List[Expression]: + """ + 获取未评估的表达方式 + + Args: + evaluated_pairs: 已评估的项目(situation, style)元组集合 + batch_size: 每次获取的数量 + + Returns: + 未评估的表达方式列表 + """ + try: + # 查询所有表达方式 + all_expressions = 
list(Expression.select()) + + if not all_expressions: + logger.warning("数据库中没有表达方式记录") + return [] + + # 过滤出未评估的项目:匹配 situation 和 style 均一致 + unevaluated = [ + expr for expr in all_expressions + if (expr.situation, expr.style) not in evaluated_pairs + ] + + if not unevaluated: + logger.info("所有项目都已评估完成") + return [] + + # 如果未评估数量少于请求数量,返回所有 + if len(unevaluated) <= batch_size: + logger.info(f"剩余 {len(unevaluated)} 条未评估项目,全部返回") + return unevaluated + + # 随机选择指定数量 + selected = random.sample(unevaluated, batch_size) + logger.info(f"从 {len(unevaluated)} 条未评估项目中随机选择了 {len(selected)} 条") + return selected + + except Exception as e: + logger.error(f"获取未评估表达方式失败: {e}") + import traceback + logger.error(traceback.format_exc()) + return [] + + +def manual_evaluate_expression(expression: Expression, index: int, total: int) -> Dict: + """ + 人工评估单个表达方式 + + Args: + expression: 表达方式对象 + index: 当前索引(从1开始) + total: 总数 + + Returns: + 评估结果字典,如果用户退出则返回 None + """ + print("\n" + "=" * 60) + print(f"人工评估 [{index}/{total}]") + print("=" * 60) + print(f"Situation: {expression.situation}") + print(f"Style: {expression.style}") + print("\n请评估该表达方式是否合适:") + print(" 输入 'y' 或 'yes' 或 '1' 表示合适(通过)") + print(" 输入 'n' 或 'no' 或 '0' 表示不合适(不通过)") + print(" 输入 'q' 或 'quit' 退出评估") + print(" 输入 's' 或 'skip' 跳过当前项目") + + while True: + user_input = input("\n您的评估 (y/n/q/s): ").strip().lower() + + if user_input in ['q', 'quit']: + print("退出评估") + return None + + if user_input in ['s', 'skip']: + print("跳过当前项目") + return "skip" + + if user_input in ['y', 'yes', '1', '是', '通过']: + suitable = True + break + elif user_input in ['n', 'no', '0', '否', '不通过']: + suitable = False + break + else: + print("输入无效,请重新输入 (y/n/q/s)") + + result = { + "situation": expression.situation, + "style": expression.style, + "suitable": suitable, + "reason": None, + "evaluator": "manual", + "evaluated_at": datetime.now().isoformat() + } + + print(f"\n✓ 已记录:{'通过' if suitable else '不通过'}") + + return result + + +def main(): + """主函数""" + logger.info("=" * 60) + logger.info("开始表达方式人工评估") + logger.info("=" * 60) + + # 初始化数据库连接 + try: + db.connect(reuse_if_open=True) + logger.info("数据库连接成功") + except Exception as e: + logger.error(f"数据库连接失败: {e}") + return + + # 加载已有评估结果 + existing_results, evaluated_pairs = load_existing_results() + manual_results = existing_results.copy() + + if evaluated_pairs: + print(f"\n已加载 {len(existing_results)} 条已有评估结果") + print(f"已评估项目数: {len(evaluated_pairs)}") + + print("\n" + "=" * 60) + print("开始人工评估") + print("=" * 60) + print("提示:可以随时输入 'q' 退出,输入 's' 跳过当前项目") + print("评估结果会自动保存到文件\n") + + batch_size = 10 + batch_count = 0 + + while True: + # 获取未评估的项目 + expressions = get_unevaluated_expressions(evaluated_pairs, batch_size) + + if not expressions: + print("\n" + "=" * 60) + print("所有项目都已评估完成!") + print("=" * 60) + break + + batch_count += 1 + print(f"\n--- 批次 {batch_count}:评估 {len(expressions)} 条项目 ---") + + batch_results = [] + for i, expression in enumerate(expressions, 1): + manual_result = manual_evaluate_expression(expression, i, len(expressions)) + + if manual_result is None: + # 用户退出 + print("\n评估已中断") + if batch_results: + # 保存当前批次的结果 + manual_results.extend(batch_results) + save_results(manual_results) + return + + if manual_result == "skip": + # 跳过当前项目 + continue + + batch_results.append(manual_result) + # 使用 (situation, style) 作为唯一标识 + evaluated_pairs.add((manual_result["situation"], manual_result["style"])) + + # 将当前批次结果添加到总结果中 + manual_results.extend(batch_results) + + # 保存结果 + save_results(manual_results) + + 
print(f"\n当前批次完成,已评估总数: {len(manual_results)} 条") + + # 询问是否继续 + while True: + continue_input = input("\n是否继续评估下一批?(y/n): ").strip().lower() + if continue_input in ['y', 'yes', '1', '是', '继续']: + break + elif continue_input in ['n', 'no', '0', '否', '退出']: + print("\n评估结束") + return + else: + print("输入无效,请重新输入 (y/n)") + + # 关闭数据库连接 + try: + db.close() + logger.info("数据库连接已关闭") + except Exception as e: + logger.warning(f"关闭数据库连接时出错: {e}") + + +if __name__ == "__main__": + main() + diff --git a/scripts/evaluate_expressions_v5.py b/scripts/evaluate_expressions_v5.py new file mode 100644 index 00000000..0f1a814c --- /dev/null +++ b/scripts/evaluate_expressions_v5.py @@ -0,0 +1,476 @@ +""" +表达方式评估脚本 + +功能: +1. 随机读取指定数量的表达方式,获取其situation和style +2. 先进行人工评估(逐条手动评估) +3. 然后使用LLM进行评估 +4. 对比人工评估和LLM评估的正确率、精确率、召回率、F1分数等指标(以人工评估为标准) +5. 不真正修改数据库,只是做评估 +""" + +import asyncio +import random +import json +import sys +import os +from typing import List, Dict + +# 添加项目根目录到路径 +project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) +sys.path.insert(0, project_root) + +from src.common.database.database_model import Expression +from src.common.database.database import db +from src.llm_models.utils_model import LLMRequest +from src.config.config import model_config +from src.common.logger import get_logger + +logger = get_logger("expression_evaluator_comparison") + + +def get_random_expressions(count: int = 10) -> List[Expression]: + """ + 随机读取指定数量的表达方式 + + Args: + count: 要读取的数量,默认10条 + + Returns: + 表达方式列表 + """ + try: + # 查询所有表达方式 + all_expressions = list(Expression.select()) + + if not all_expressions: + logger.warning("数据库中没有表达方式记录") + return [] + + # 如果总数少于请求数量,返回所有 + if len(all_expressions) <= count: + logger.info(f"数据库中共有 {len(all_expressions)} 条表达方式,全部返回") + return all_expressions + + # 随机选择指定数量 + selected = random.sample(all_expressions, count) + logger.info(f"从 {len(all_expressions)} 条表达方式中随机选择了 {len(selected)} 条") + return selected + + except Exception as e: + logger.error(f"随机读取表达方式失败: {e}") + import traceback + logger.error(traceback.format_exc()) + return [] + + +def manual_evaluate_expression(expression: Expression, index: int, total: int) -> Dict: + """ + 人工评估单个表达方式 + + Args: + expression: 表达方式对象 + index: 当前索引(从1开始) + total: 总数 + + Returns: + 评估结果字典,包含: + - expression_id: 表达方式ID + - situation: 情境 + - style: 风格 + - suitable: 是否合适(人工评估) + - reason: 评估理由(始终为None) + """ + print("\n" + "=" * 60) + print(f"人工评估 [{index}/{total}]") + print("=" * 60) + print(f"Situation: {expression.situation}") + print(f"Style: {expression.style}") + print("\n请评估该表达方式是否合适:") + print(" 输入 'y' 或 'yes' 或 '1' 表示合适(通过)") + print(" 输入 'n' 或 'no' 或 '0' 表示不合适(不通过)") + print(" 输入 'q' 或 'quit' 退出评估") + + while True: + user_input = input("\n您的评估 (y/n/q): ").strip().lower() + + if user_input in ['q', 'quit']: + print("退出评估") + return None + + if user_input in ['y', 'yes', '1', '是', '通过']: + suitable = True + break + elif user_input in ['n', 'no', '0', '否', '不通过']: + suitable = False + break + else: + print("输入无效,请重新输入 (y/n/q)") + + result = { + "expression_id": expression.id, + "situation": expression.situation, + "style": expression.style, + "suitable": suitable, + "reason": None, + "evaluator": "manual" + } + + print(f"\n✓ 已记录:{'通过' if suitable else '不通过'}") + + return result + + +def create_evaluation_prompt(situation: str, style: str) -> str: + """ + 创建评估提示词 + + Args: + situation: 情境 + style: 风格 + + Returns: + 评估提示词 + """ + prompt = f"""请评估以下表达方式是否合适: + +情境(situation):{situation} +风格(style):{style} + 
+请从以下方面进行评估: +1. 情境描述是否清晰、准确 +2. 风格表达是否合理、自然 +3. 情境和风格是否匹配 +4. 允许部分语法错误出现 +5. 允许口头化或缺省表达 +6. 允许部分上下文缺失 + +请以JSON格式输出评估结果: +{{ + "suitable": true/false, + "reason": "评估理由(如果不合适,请说明原因)" +}} + +如果合适,suitable设为true;如果不合适,suitable设为false,并在reason中说明原因。 +请严格按照JSON格式输出,不要包含其他内容。""" + + return prompt + + +async def _single_llm_evaluation(expression: Expression, llm: LLMRequest) -> tuple[bool, str, str | None]: + """ + 执行单次LLM评估 + + Args: + expression: 表达方式对象 + llm: LLM请求实例 + + Returns: + (suitable, reason, error) 元组,如果出错则 suitable 为 False,error 包含错误信息 + """ + try: + prompt = create_evaluation_prompt(expression.situation, expression.style) + logger.debug(f"正在评估表达方式 ID: {expression.id}") + + response, (reasoning, model_name, _) = await llm.generate_response_async( + prompt=prompt, + temperature=0.6, + max_tokens=1024 + ) + + logger.debug(f"LLM响应: {response}") + + # 解析JSON响应 + try: + evaluation = json.loads(response) + except json.JSONDecodeError: + import re + json_match = re.search(r'\{[^{}]*"suitable"[^{}]*\}', response, re.DOTALL) + if json_match: + evaluation = json.loads(json_match.group()) + else: + raise ValueError("无法从响应中提取JSON格式的评估结果") + + suitable = evaluation.get("suitable", False) + reason = evaluation.get("reason", "未提供理由") + + logger.debug(f"评估结果: {'通过' if suitable else '不通过'}") + return suitable, reason, None + + except Exception as e: + logger.error(f"评估表达方式 ID: {expression.id} 时出错: {e}") + return False, f"评估过程出错: {str(e)}", str(e) + + +async def evaluate_expression_llm(expression: Expression, llm: LLMRequest) -> Dict: + """ + 使用LLM评估单个表达方式 + + Args: + expression: 表达方式对象 + llm: LLM请求实例 + + Returns: + 评估结果字典 + """ + logger.info(f"开始评估表达方式 ID: {expression.id}") + + suitable, reason, error = await _single_llm_evaluation(expression, llm) + + if error: + suitable = False + + logger.info(f"评估完成: {'通过' if suitable else '不通过'}") + + return { + "expression_id": expression.id, + "situation": expression.situation, + "style": expression.style, + "suitable": suitable, + "reason": reason, + "error": error, + "evaluator": "llm" + } + + +def compare_evaluations(manual_results: List[Dict], llm_results: List[Dict], method_name: str) -> Dict: + """ + 对比人工评估和LLM评估的结果 + + Args: + manual_results: 人工评估结果列表 + llm_results: LLM评估结果列表 + method_name: 评估方法名称(用于标识) + + Returns: + 对比分析结果字典 + """ + # 按expression_id建立映射 + llm_dict = {r["expression_id"]: r for r in llm_results} + + total = len(manual_results) + matched = 0 + true_positives = 0 + true_negatives = 0 + false_positives = 0 + false_negatives = 0 + + for manual_result in manual_results: + llm_result = llm_dict.get(manual_result["expression_id"]) + if llm_result is None: + continue + + manual_suitable = manual_result["suitable"] + llm_suitable = llm_result["suitable"] + + if manual_suitable == llm_suitable: + matched += 1 + + if manual_suitable and llm_suitable: + true_positives += 1 + elif not manual_suitable and not llm_suitable: + true_negatives += 1 + elif not manual_suitable and llm_suitable: + false_positives += 1 + elif manual_suitable and not llm_suitable: + false_negatives += 1 + + accuracy = (matched / total * 100) if total > 0 else 0 + precision = (true_positives / (true_positives + false_positives) * 100) if (true_positives + false_positives) > 0 else 0 + recall = (true_positives / (true_positives + false_negatives) * 100) if (true_positives + false_negatives) > 0 else 0 + f1_score = (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0 + specificity = (true_negatives / (true_negatives + false_positives) * 100) if 
(true_negatives + false_positives) > 0 else 0 + + random_baseline = 50.0 + accuracy_above_random = accuracy - random_baseline + accuracy_improvement_ratio = (accuracy / random_baseline) if random_baseline > 0 else 0 + + return { + "method": method_name, + "total": total, + "matched": matched, + "accuracy": accuracy, + "accuracy_above_random": accuracy_above_random, + "accuracy_improvement_ratio": accuracy_improvement_ratio, + "true_positives": true_positives, + "true_negatives": true_negatives, + "false_positives": false_positives, + "false_negatives": false_negatives, + "precision": precision, + "recall": recall, + "f1_score": f1_score, + "specificity": specificity + } + + + + +async def main(): + """主函数""" + logger.info("=" * 60) + logger.info("开始表达方式评估") + logger.info("=" * 60) + + # 初始化数据库连接 + try: + db.connect(reuse_if_open=True) + logger.info("数据库连接成功") + except Exception as e: + logger.error(f"数据库连接失败: {e}") + return + + # 1. 随机读取表达方式 + logger.info("\n步骤1: 随机读取表达方式") + expressions = get_random_expressions(10) + if not expressions: + logger.error("没有可用的表达方式,退出") + return + logger.info(f"成功读取 {len(expressions)} 条表达方式") + + # 2. 人工评估 + print("\n" + "=" * 60) + print("开始人工评估") + print("=" * 60) + print(f"共需要评估 {len(expressions)} 条表达方式") + print("请逐条进行评估...\n") + + manual_results = [] + for i, expression in enumerate(expressions, 1): + manual_result = manual_evaluate_expression(expression, i, len(expressions)) + if manual_result is None: + print("\n评估已中断") + return + manual_results.append(manual_result) + + print("\n" + "=" * 60) + print("人工评估完成") + print("=" * 60) + + # 3. 创建LLM实例并评估 + logger.info("\n步骤3: 创建LLM实例") + try: + llm = LLMRequest( + model_set=model_config.model_task_config.tool_use, + request_type="expression_evaluator_comparison" + ) + except Exception as e: + logger.error(f"创建LLM实例失败: {e}") + import traceback + logger.error(traceback.format_exc()) + return + + logger.info("\n步骤4: 开始LLM评估") + llm_results = [] + for i, expression in enumerate(expressions, 1): + logger.info(f"LLM评估进度: {i}/{len(expressions)}") + llm_results.append(await evaluate_expression_llm(expression, llm)) + await asyncio.sleep(0.3) + + # 4. 对比分析并输出结果 + comparison = compare_evaluations(manual_results, llm_results, "LLM评估") + + print("\n" + "=" * 60) + print("评估结果(以人工评估为标准)") + print("=" * 60) + print("\n评估目标:") + print(" 1. 核心能力:将不合适的项目正确提取出来(特定负类召回率)") + print(" 2. 
次要能力:尽可能少的误删合适的项目(召回率)") + + # 详细评估结果(核心指标优先) + print("\n【详细对比】") + print(f"\n--- {comparison['method']} ---") + print(f" 总数: {comparison['total']} 条") + print() + print(" 【核心能力指标】") + print(f" ⭐ 特定负类召回率: {comparison['specificity']:.2f}% (将不合适项目正确提取出来的能力)") + print(f" - 计算: TN / (TN + FP) = {comparison['true_negatives']} / ({comparison['true_negatives']} + {comparison['false_positives']})") + print(f" - 含义: 在 {comparison['true_negatives'] + comparison['false_positives']} 个实际不合适的项目中,正确识别出 {comparison['true_negatives']} 个") + print(f" - 随机水平: 50.00% (当前高于随机: {comparison['specificity'] - 50.0:+.2f}%)") + print() + print(f" ⭐ 召回率: {comparison['recall']:.2f}% (尽可能少的误删合适项目的能力)") + print(f" - 计算: TP / (TP + FN) = {comparison['true_positives']} / ({comparison['true_positives']} + {comparison['false_negatives']})") + print(f" - 含义: 在 {comparison['true_positives'] + comparison['false_negatives']} 个实际合适的项目中,正确识别出 {comparison['true_positives']} 个") + print(f" - 随机水平: 50.00% (当前高于随机: {comparison['recall'] - 50.0:+.2f}%)") + print() + print(" 【其他指标】") + print(f" 准确率: {comparison['accuracy']:.2f}% (整体判断正确率)") + print(f" 精确率: {comparison['precision']:.2f}% (判断为合适的项目中,实际合适的比例)") + print(f" F1分数: {comparison['f1_score']:.2f} (精确率和召回率的调和平均)") + print(f" 匹配数: {comparison['matched']}/{comparison['total']}") + print() + print(" 【分类统计】") + print(f" TP (正确识别为合适): {comparison['true_positives']}") + print(f" TN (正确识别为不合适): {comparison['true_negatives']} ⭐") + print(f" FP (误判为合适): {comparison['false_positives']} ⚠️") + print(f" FN (误删合适项目): {comparison['false_negatives']} ⚠️") + + # 5. 输出人工评估不通过但LLM误判为通过的详细信息 + print("\n" + "=" * 60) + print("人工评估不通过但LLM误判为通过的项目(FP - False Positive)") + print("=" * 60) + + # 按expression_id建立映射 + llm_dict = {r["expression_id"]: r for r in llm_results} + + fp_items = [] + for manual_result in manual_results: + llm_result = llm_dict.get(manual_result["expression_id"]) + if llm_result is None: + continue + + # 人工评估不通过,但LLM评估通过(FP情况) + if not manual_result["suitable"] and llm_result["suitable"]: + fp_items.append({ + "expression_id": manual_result["expression_id"], + "situation": manual_result["situation"], + "style": manual_result["style"], + "manual_suitable": manual_result["suitable"], + "llm_suitable": llm_result["suitable"], + "llm_reason": llm_result.get("reason", "未提供理由"), + "llm_error": llm_result.get("error") + }) + + if fp_items: + print(f"\n共找到 {len(fp_items)} 条误判项目:\n") + for idx, item in enumerate(fp_items, 1): + print(f"--- [{idx}] 项目 ID: {item['expression_id']} ---") + print(f"Situation: {item['situation']}") + print(f"Style: {item['style']}") + print("人工评估: 不通过 ❌") + print("LLM评估: 通过 ✅ (误判)") + if item.get('llm_error'): + print(f"LLM错误: {item['llm_error']}") + print(f"LLM理由: {item['llm_reason']}") + print() + else: + print("\n✓ 没有误判项目(所有人工评估不通过的项目都被LLM正确识别为不通过)") + + # 6. 
保存结果到JSON文件 + output_file = os.path.join(project_root, "data", "expression_evaluation_comparison.json") + try: + os.makedirs(os.path.dirname(output_file), exist_ok=True) + with open(output_file, "w", encoding="utf-8") as f: + json.dump({ + "manual_results": manual_results, + "llm_results": llm_results, + "comparison": comparison + }, f, ensure_ascii=False, indent=2) + logger.info(f"\n评估结果已保存到: {output_file}") + except Exception as e: + logger.warning(f"保存结果到文件失败: {e}") + + print("\n" + "=" * 60) + print("评估完成") + print("=" * 60) + + # 关闭数据库连接 + try: + db.close() + logger.info("数据库连接已关闭") + except Exception as e: + logger.warning(f"关闭数据库连接时出错: {e}") + + +if __name__ == "__main__": + asyncio.run(main()) + diff --git a/src/memory_system/memory_retrieval.py b/src/memory_system/memory_retrieval.py index b4f9b21e..a893b3cc 100644 --- a/src/memory_system/memory_retrieval.py +++ b/src/memory_system/memory_retrieval.py @@ -521,6 +521,7 @@ async def _react_agent_solve_question( logger.warning(f"{react_log_prefix}第 {iteration + 1} 次迭代 无工具调用且无响应") step["observations"] = ["无响应且无工具调用"] thinking_steps.append(step) + iteration += 1 # 在continue之前增加迭代计数,避免跳过iteration += 1 continue # 处理工具调用 @@ -1021,6 +1022,11 @@ async def _process_single_question( Returns: Optional[str]: 如果找到答案,返回格式化的结果字符串,否则返回None """ + # 如果question为空或None,直接返回None,不进行查询 + if not question or not question.strip(): + logger.debug("问题为空,跳过查询") + return None + # logger.info(f"开始处理问题: {question}") _cleanup_stale_not_found_thinking_back() @@ -1116,15 +1122,14 @@ async def build_memory_retrieval_prompt( recent_query_history = "最近没有查询记录。" # 第一步:生成问题或使用 Planner 提供的问题 - questions = [] + single_question: Optional[str] = None # 如果 planner_question 配置开启,只使用 Planner 提供的问题,不使用旧模式 if global_config.memory.planner_question: if question and isinstance(question, str) and question.strip(): # 清理和验证 question - cleaned_question = question.strip() - questions = [cleaned_question] - logger.info(f"{log_prefix}使用 Planner 提供的 question: {cleaned_question}") + single_question = question.strip() + logger.info(f"{log_prefix}使用 Planner 提供的 question: {single_question}") else: # planner_question 开启但没有提供 question,跳过记忆检索 logger.debug(f"{log_prefix}planner_question 已开启但未提供 question,跳过记忆检索") @@ -1157,10 +1162,11 @@ async def build_memory_retrieval_prompt( logger.error(f"{log_prefix}LLM生成问题失败: {response}") return "" - # 解析概念列表和问题列表 + # 解析概念列表和问题列表,只取第一个问题 _, questions = parse_questions_json(response) - if questions: - logger.info(f"{log_prefix}解析到 {len(questions)} 个问题: {questions}") + if questions and len(questions) > 0: + single_question = questions[0].strip() + logger.info(f"{log_prefix}解析到问题: {single_question}") # 初始阶段:使用 Planner 提供的 unknown_words 进行检索(如果提供) initial_info = "" @@ -1183,13 +1189,13 @@ async def build_memory_retrieval_prompt( else: logger.debug(f"{log_prefix}unknown_words 检索未找到任何结果") - if not questions: + if not single_question: logger.debug(f"{log_prefix}模型认为不需要检索记忆或解析失败,不返回任何查询结果") end_time = time.time() logger.info(f"{log_prefix}无当次查询,不返回任何结果,耗时: {(end_time - start_time):.3f}秒") return "" - # 第二步:并行处理所有问题(使用配置的最大迭代次数和超时时间) + # 第二步:处理问题(使用配置的最大迭代次数和超时时间) base_max_iterations = global_config.memory.max_agent_iterations # 根据think_level调整迭代次数:think_level=1时不变,think_level=0时减半 if think_level == 0: @@ -1198,31 +1204,21 @@ async def build_memory_retrieval_prompt( max_iterations = base_max_iterations timeout_seconds = global_config.memory.agent_timeout_seconds logger.debug( - f"{log_prefix}问题数量: {len(questions)},think_level={think_level},设置最大迭代次数: {max_iterations}(基础值: 
{base_max_iterations}),超时时间: {timeout_seconds}秒" + f"{log_prefix}问题: {single_question},think_level={think_level},设置最大迭代次数: {max_iterations}(基础值: {base_max_iterations}),超时时间: {timeout_seconds}秒" ) - # 并行处理所有问题 - question_tasks = [ - _process_single_question( - question=question, + # 处理单个问题 + try: + result = await _process_single_question( + question=single_question, chat_id=chat_id, context=message, initial_info=initial_info, max_iterations=max_iterations, ) - for question in questions - ] - - # 并行执行所有查询任务 - results = await asyncio.gather(*question_tasks, return_exceptions=True) - - # 收集所有有效结果 - question_results: List[str] = [] - for i, result in enumerate(results): - if isinstance(result, Exception): - logger.error(f"{log_prefix}处理问题 '{questions[i]}' 时发生异常: {result}") - elif result is not None: - question_results.append(result) + except Exception as e: + logger.error(f"{log_prefix}处理问题 '{single_question}' 时发生异常: {e}") + result = None # 获取最近10分钟内已找到答案的缓存记录 cached_answers = _get_recent_found_answers(chat_id, time_window_seconds=600.0) @@ -1231,29 +1227,29 @@ async def build_memory_retrieval_prompt( all_results = [] # 先添加当前查询的结果 - current_questions = set() - for result in question_results: + current_question = None + if result: + all_results.append(result) # 提取问题(格式为 "问题:xxx\n答案:xxx") if result.startswith("问题:"): question_end = result.find("\n答案:") if question_end != -1: - current_questions.add(result[4:question_end]) - all_results.append(result) + current_question = result[4:question_end] - # 添加缓存答案(排除当前查询中已存在的问题) + # 添加缓存答案(排除当前查询的问题) for cached_answer in cached_answers: if cached_answer.startswith("问题:"): question_end = cached_answer.find("\n答案:") if question_end != -1: cached_question = cached_answer[4:question_end] - if cached_question not in current_questions: + if cached_question != current_question: all_results.append(cached_answer) end_time = time.time() if all_results: retrieved_memory = "\n\n".join(all_results) - current_count = len(question_results) + current_count = 1 if result else 0 cached_count = len(all_results) - current_count logger.info( f"{log_prefix}记忆检索成功,耗时: {(end_time - start_time):.3f}秒," @@ -1261,7 +1257,7 @@ async def build_memory_retrieval_prompt( ) return f"你回忆起了以下信息:\n{retrieved_memory}\n如果与回复内容相关,可以参考这些回忆的信息。\n" else: - logger.debug(f"{log_prefix}所有问题均未找到答案,且无缓存答案") + logger.debug(f"{log_prefix}问题未找到答案,且无缓存答案") return "" except Exception as e: diff --git a/template/model_config_template.toml b/template/model_config_template.toml index c9ba015c..f9fae003 100644 --- a/template/model_config_template.toml +++ b/template/model_config_template.toml @@ -141,13 +141,13 @@ temperature = 0.2 # 模型温度,新V3建议0.1-0.3 max_tokens = 4096 # 最大输出token数 slow_threshold = 15.0 # 慢请求阈值(秒),模型等待回复时间超过此值会输出警告日志 -[model_task_config.tool_use] #工具调用模型,需要使用支持工具调用的模型 +[model_task_config.tool_use] #功能模型,需要使用支持工具调用的模型,请使用较快的小模型(调用量较大) model_list = ["qwen3-30b","qwen3-next-80b"] temperature = 0.7 -max_tokens = 800 +max_tokens = 1024 slow_threshold = 10.0 -[model_task_config.replyer] # 首要回复模型,还用于表达器和表达方式学习 +[model_task_config.replyer] # 首要回复模型,还用于表达方式学习 model_list = ["siliconflow-deepseek-v3.2","siliconflow-deepseek-v3.2-think","siliconflow-glm-4.6","siliconflow-glm-4.6-think"] temperature = 0.3 # 模型温度,新V3建议0.1-0.3 max_tokens = 2048
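
Note on the comparison metrics in evaluate_expressions_llm_v6.py and evaluate_expressions_v5.py: the scoring in compare_evaluations() is ordinary confusion-matrix arithmetic over (manual verdict, LLM verdict) pairs, with "suitable" as the positive class. The standalone sketch below mirrors those formulas (percent-scaled precision/recall, accuracy as matched/total under the assumption that every manual item has a matching LLM result, plus the two "unsuitable rate" figures). The function name `metrics` and the 10-item example data are illustrative and not part of the PR; a positive gap between the last two numbers is what the script reports as `rate_difference`, i.e. the pool kept by the LLM filter contains a smaller share of unsuitable items than the raw pool.

# Minimal sketch of the confusion-matrix arithmetic used by compare_evaluations().
# Labels are booleans: True means "suitable" per the manual gold standard / LLM verdict.
# Example data below are made up for illustration, not results from this PR.

def metrics(pairs: list[tuple[bool, bool]]) -> dict:
    """pairs = [(manual_suitable, llm_suitable), ...]"""
    tp = sum(1 for m, l in pairs if m and l)          # suitable item kept
    tn = sum(1 for m, l in pairs if not m and not l)  # unsuitable item filtered out
    fp = sum(1 for m, l in pairs if not m and l)      # unsuitable item let through
    fn = sum(1 for m, l in pairs if m and not l)      # suitable item wrongly dropped
    total = len(pairs)
    pct = lambda num, den: num / den * 100 if den else 0.0
    precision = pct(tp, tp + fp)
    recall = pct(tp, tp + fn)
    return {
        "accuracy": pct(tp + tn, total),
        "precision": precision,
        "recall": recall,
        "specificity": pct(tn, tn + fp),  # ability to catch unsuitable items
        "f1_score": 2 * precision * recall / (precision + recall) if precision + recall else 0.0,
        # share of unsuitable items in the raw pool vs. among the items the LLM keeps
        "manual_unsuitable_rate": pct(tn + fp, total),
        "llm_kept_unsuitable_rate": pct(fp, tp + fp),
    }

if __name__ == "__main__":
    # 10 hypothetical items: 6 suitable, 4 unsuitable; the "LLM" keeps 5 good ones,
    # drops 1 good one, and lets 1 bad one through.
    example = [(True, True)] * 5 + [(True, False)] + [(False, False)] * 3 + [(False, True)]
    for name, value in metrics(example).items():
        print(f"{name}: {value:.2f}")

Running this prints specificity 75.00, manual_unsuitable_rate 40.00 and llm_kept_unsuitable_rate 16.67, which is the "filtering lowered the unsuitable share" case the script's rate_difference output is meant to detect.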
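
Note on the one-line fix in _react_agent_solve_question (memory_retrieval.py): adding `iteration += 1` before the `continue` guards against the usual manual-counter pitfall, assuming the surrounding loop advances `iteration` by hand (which is what the added line and its comment imply). If the increment only happens at the bottom of the loop body, a `continue` taken on the "no tool call and no response" branch skips it and the loop can spin on the same iteration indefinitely. The sketch below is illustrative only; the loop structure, bound and flag names are mine, the only detail taken from the PR is the placement of the increment before `continue`.

# Illustrative reproduction of the while/continue pitfall fixed in the PR.
max_iterations = 5
iteration = 0
while iteration < max_iterations:
    got_response = False              # stand-in for "no tool calls and no response"
    if not got_response:
        print(f"iteration {iteration + 1}: empty, moving on")
        iteration += 1                # without this, `continue` skips the increment
        continue                      # at the bottom and the loop never advances
    # ... handle tool calls / final response here ...
    iteration += 1
print("loop finished")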