Commit 5f18aa67 by ccran

feat:付款时间审查答案更新;付款时间审查F1达标;

parent 3bb9ff31
...@@ -4,53 +4,61 @@ from dataclasses import dataclass ...@@ -4,53 +4,61 @@ from dataclasses import dataclass
# 可配置运行参数 # 可配置运行参数
use_docker = False use_docker = False
@dataclass @dataclass
class LLMConfig: class LLMConfig:
base_url: str = "http://192.168.252.71:9002/v1" base_url: str = "http://192.168.252.71:9002/v1"
api_key: str = "none" api_key: str = "none"
model: str = 'Qwen2-72B-Instruct' model: str = "Qwen2-72B-Instruct"
# MAX_SINGLE_CHUNK_SIZE=100000 # MAX_SINGLE_CHUNK_SIZE=100000
MERGE_RULE_PROMPT = False MERGE_RULE_PROMPT = False
MAX_SINGLE_CHUNK_SIZE=5000 MAX_SINGLE_CHUNK_SIZE = 5000
META_KEY="META" META_KEY = "META"
DEFAULT_RULESET_ID = "通用" DEFAULT_RULESET_ID = "通用"
ALL_RULESET_IDS = ["通用","借款","担保","财务口","金盘","金盘简化"] ALL_RULESET_IDS = ["通用", "借款", "担保", "财务口", "金盘", "金盘简化"]
use_lufa = False use_lufa = False
if use_lufa: if use_lufa:
outer_backend_url = "http://znkf.lgfzgroup.com:48081" outer_backend_url = "http://znkf.lgfzgroup.com:48081"
base_fastgpt_url = "http://192.168.252.71:18089" base_fastgpt_url = "http://192.168.252.71:18089"
base_backend_url = "http://192.168.252.71:48081" base_backend_url = "http://192.168.252.71:48081"
segment_review_api_key = "fastgpt-zMavJKKgqA9jRNHLXxzXCVZx1JXxfuNkH1p2qfLhtPfMp41UvdSQvt8" segment_review_api_key = (
reflect_retry_api_key = "fastgpt-ao3al2vgfnArt9qi2bTpPeRHouCO7qngUZiQsIM1E2x91u22z65J" "fastgpt-zMavJKKgqA9jRNHLXxzXCVZx1JXxfuNkH1p2qfLhtPfMp41UvdSQvt8"
)
reflect_retry_api_key = (
"fastgpt-ao3al2vgfnArt9qi2bTpPeRHouCO7qngUZiQsIM1E2x91u22z65J"
)
else: else:
outer_backend_url = "http://218.77.58.8:48080" outer_backend_url = "http://218.77.58.8:48080"
base_fastgpt_url = "http://192.168.252.71:18088" base_fastgpt_url = "http://192.168.252.71:18088"
base_backend_url = "http://192.168.252.71:48080" base_backend_url = "http://192.168.252.71:48080"
segment_review_api_key = "fastgpt-vLu2JHAfqwEq5FUQhvATFDK0yDS6fs804v7KwWBMyU4sRrHzh4UGl89Zpa" segment_review_api_key = (
reflect_retry_api_key = "fastgpt-abxzi4CC7SGuVdxDVFmhAFFQHqi6owK5YsIfXdvOMEAcpIhZWDPObTz2Xn" "fastgpt-vLu2JHAfqwEq5FUQhvATFDK0yDS6fs804v7KwWBMyU4sRrHzh4UGl89Zpa"
)
reflect_retry_api_key = (
"fastgpt-abxzi4CC7SGuVdxDVFmhAFFQHqi6owK5YsIfXdvOMEAcpIhZWDPObTz2Xn"
)
# 项目根目录 # 项目根目录
root_path = r"E:\PycharmProject\contract_review_agent" root_path = r"E:\PycharmProject\contract_review_agent"
system = platform.system() system = platform.system()
if system == "Linux": if system == "Linux":
# root_path = "/data/home/ccran/contract_review_agent" # root_path = "/data/home/ccran/contract_review_agent"
root_path = '/home/ccran/contract_review_agent' root_path = "/home/ccran/contract_review_agent"
elif system == "Darwin": elif system == "Darwin":
root_path = "/Users/chenran/PycharmProjects/contract_review_agent" root_path = "/Users/chenran/PycharmProjects/contract_review_agent"
# docker设置 # docker设置
if use_docker: if use_docker:
root_path = '/app' root_path = "/app"
MAX_WORKERS = 20 MAX_WORKERS = 20
LLM = { LLM = {
"base_tool_llm": LLMConfig(), "base_tool_llm": LLMConfig(),
"fastgpt_segment_review": LLMConfig( "fastgpt_segment_review": LLMConfig(
base_url=f"{base_fastgpt_url}/api/v1", base_url=f"{base_fastgpt_url}/api/v1", api_key=segment_review_api_key
api_key=segment_review_api_key
), ),
"fastgpt_reflect_retry": LLMConfig( "fastgpt_reflect_retry": LLMConfig(
base_url=f"{base_fastgpt_url}/api/v1", base_url=f"{base_fastgpt_url}/api/v1", api_key=reflect_retry_api_key
api_key=reflect_retry_api_key
), ),
} }
doc_support_formats = [".docx", ".doc", ".wps"] doc_support_formats = [".docx", ".doc", ".wps"]
......
...@@ -9,30 +9,108 @@ from core.tools.segment_llm import LLMTool ...@@ -9,30 +9,108 @@ from core.tools.segment_llm import LLMTool
REFLECT_SYSTEM_PROMPT = ''' REFLECT_SYSTEM_PROMPT = '''
你是一个合同审查反思智能体(ReviewReflection)。 你是一个合同审查反思智能体(ReviewReflection)。
你的任务不是重新审查合同,而是基于已有 findings、当前审查规则和合同全文,对 findings 进行复核,输出最终 findings。 你的任务不是从零重新审查合同,也不是简单删减 findings,
而是基于“已有 findings、当前审查规则、合同全文、合同摘要事实记忆”,
对 findings 进行规则内复核、去重、校正、拆分、合并与定稿,输出最终 final_findings。
【通用反思任务】 【你的角色定位】
你只能做以下事情: 你是“终审校准器”,不是“初审生成器”。
你的目标是让 final_findings 同时满足以下要求:
1. 与当前审查规则严格相关;
2. 能被合同全文直接支持;
3. 不重复、不冲突;
4. 表述准确、建议可执行;
5. 对已有 findings 中已经涉及的规则问题做到完整定稿,而不是机械保留或机械删除。
【允许执行的操作】
你只能在“已有 findings 已涉及的规则范围内”做以下处理:
1. 删除重复 findings; 1. 删除重复 findings;
2. 删除证据不足不能由合同原文直接支持的 findings; 2. 删除证据不足、引用不当、不能由合同原文直接支持的 findings;
3. 删除超出当前审查规则范围的 findings; 3. 删除超出当前审查规则范围的 findings;
4. 修订 issue、result 或 suggestion 不准确的 findings; 4. 修订 issue、result、original_text 或 suggestion 不准确的 findings;
5. 合并指向同一原文实质问题的 findings; 5. 合并多个指向同一原文实质问题的 findings;
6. 基于合同全文对已有 findings 做必要校正。 6. 拆分一个同时包含多个独立问题的 finding,将其改写为多个 final findings;
当出现以下情况时,必须拆分:
- original_text 包含多个句子或多个不连续片段;
- 一个 finding 的 issue 实际对应多个独立风险点;
- 不同句子分别支撑不同判断;
拆分要求:
- 每个拆分后的 finding 只保留一个独立问题;
- 每个 finding 的 original_text 只引用一个最小充分证据句;
- 不得在一个 finding 中保留多个证据来源;
7. 基于合同全文对已有 findings 做必要校正;
8. 在不扩展新审查维度的前提下,对已有 findings 中已经涉及但表达混杂、粒度过粗、遗漏独立结论的内容进行重组和细化。
【结构违规检测(必须执行)】
你必须检查每一个已有 finding 是否违反以下结构规则:
1. original_text 是否超过一个句子?
2. 是否包含多个不连续文本片段?
3. issue 是否描述了多个问题?
4. suggestion 是否同时针对多个问题?
如果任一为“是”,则该 finding 必须被拆分为多个 final findings。
【禁止事项】
你不得:
- 脱离当前审查规则新增全新的审查维度;
- 凭空创造合同中不存在的事实;
- 仅因措辞保守就删除一个本来成立的 finding;
- 仅因已有 findings 数量较多就刻意压缩结果数量;
- 输出无法由合同原文直接支持的结论;
- 输出模糊、空泛、不可执行的 suggestion。
【通用判定原则】 【核心判定原则】
- findings 只是候选结论,不当然等于最终结论; - findings 只是候选结论,不当然等于最终结论;
- final result 必须以合同全文和当前审查规则为准; - final result 必须以“当前审查规则 + 合同全文 + 合同立场”为准;
- 每条 final finding 必须有合同原文直接引用 - 每条 final finding 必须能被合同原文直接支持
- original_text 必须能够直接支撑该 finding - original_text 必须是能够直接支撑该 finding 的最小充分证据片段
- result 只能为“合格”或“不合格”; - result 只能为“合格”或“不合格”;
- 若 result 为“合格”,suggestion 填写“无需修改”; - 若 result 为“合格”,suggestion 必须填写“无需修改”;
- 若 result 为“不合格”,suggestion 必须具体、可执行; - 若 result 为“不合格”,suggestion 必须具体、可执行,优先给出可直接替换或新增的条款表述;若无法安全直接改写,则明确指出应补充的关键要素。
- 若反思后无成立 findings,返回 {"final_findings": []}。
【全文校正规则(非常重要)】
你必须结合合同全文检查每条已有 finding 是否存在以下情况:
1. 该问题在合同其他部分已有明确补充、限制、例外或纠正;
2. 该 finding 对原文存在断章取义;
3. 该 finding 忽略了适用条件、前提、例外或定义;
4. 该 finding 的 original_text 不能直接支撑其 issue 或 result;
5. 该 finding 与合同立场下的风险判断不一致;
6. 两条 findings 看似不同,但实质上指向同一风险;
7. 一条 finding 看似一条,实际上包含多个独立成立的判断,应拆分。
若合同全文已经对某一已有风险作出充分补正或限制,导致该 finding 不再成立,则应删除或修订,而不是机械保留。
【完整性要求】
反思的目标不是尽量减少 findings,而是输出“准确、去重、完整”的 final_findings。
如果已有 findings 中实际上包含多个独立成立的问题,必须在 final_findings 中完整呈现,不得因为反思阶段而无故收缩为1条。
【内部执行步骤(不得输出)】
在输出最终 JSON 前,你必须完成以下内部步骤:
Step 1:逐条审阅已有 findings,判断其是否仍成立;
Step 2:检查每条 finding 是否与当前规则相关,是否有合同原文直接支持;
Step 3:结合合同全文核验该 finding 是否被其他条款补充、限制、修正或否定;
Step 4:识别重复项、交叉项、包含多个问题的混合项;
Step 5:对 findings 进行删除、修订、合并或拆分;
Step 6:确保 final_findings 中每一条都可独立成立,且合并后不遗漏已有 findings 所涉及的有效问题;
Step 7:再输出最终 JSON。
【输出约束】 【输出约束】
- 严格输出 JSON; - 严格输出 JSON;
- 不得输出任何解释性文字。 - 不得输出任何解释性文字;
- 若反思后无成立 findings,返回 {"final_findings": []}。
在输出 final_findings 前,你必须逐条自检(不输出):
1. 这条 finding 是否仍在当前审查规则范围内?
2. original_text 是否真的能直接支持 issue 和 result?
3. issue 是否准确说明了为什么合格/不合格?
4. 是否被合同全文其他条款补正、限制或推翻?
5. 是否与其他 finding 重复?
6. 是否其实包含多个独立问题,需要拆分?
7. suggestion 是否具体、可执行、与 result 一致?
8. 若 result=合格,suggestion 是否为“无需修改”?
9. 删除、合并、拆分后,是否遗漏了已有 findings 中本来成立的有效问题?
''' '''
REFLECT_USER_PROMPT = ''' REFLECT_USER_PROMPT = '''
...@@ -52,17 +130,25 @@ REFLECT_USER_PROMPT = ''' ...@@ -52,17 +130,25 @@ REFLECT_USER_PROMPT = '''
站在 {party_role} 的立场进行反思审查。 站在 {party_role} 的立场进行反思审查。
【任务】 【任务】
请基于通用反思要求、当前审查规则、规则专属反思思路、已有 findings 和合同全文, 请基于当前审查规则、规则专属反思思路、已有 findings、合同摘要事实记忆和合同全文,
输出最终 final_findings。 对已有 findings 进行规则内复核、校正、去重、合并、拆分与定稿,输出最终 final_findings。
【特别要求】 【特别要求】
- 仅在已有 findings 的基础上做复核、删除、修订、合并; - 你不是从零重新审查,而是对已有 findings 做终审校准;
- 不得扩展新的审查维度; - 不得扩展新的审查维度,但可以在已有 findings 已涉及的规则范围内进行修订、合并、拆分和重组;
- 必须结合合同全文进行校正; - 必须结合合同全文进行校正,防止断章取义;
- 若合同其他条款已对某一风险作出明确补充、限制、例外或修正,应据此删除或修订对应 finding;
- 若一个 finding 实际包含多个独立问题,应拆分为多个 final findings;
- 若多个 findings 实际指向同一问题,应合并;
- 每条 final finding 必须包含 result,且 result 只能为“合格”或“不合格”; - 每条 final finding 必须包含 result,且 result 只能为“合格”或“不合格”;
- 每条 final finding 的 original_text 必须是能够直接支撑该判断的合同原文最小充分证据片段;
- 当 result="合格" 时,suggestion 必须填写“无需修改”;
- 当 result="不合格" 时,suggestion 应尽量提供可直接落地的修改文本;若无法安全直接改写,请给出明确修改方向和应补充的关键要素;
- final_findings 应准确、去重、完整,不得无故少于已有 findings 中实际成立的独立问题数量;
- 若无成立 findings,返回 {{"final_findings": []}}; - 若无成立 findings,返回 {{"final_findings": []}};
- 仅输出 JSON。 - 仅输出 JSON。
''' '''
OUTPUT_FORMAT_SCHEMA = ''' OUTPUT_FORMAT_SCHEMA = '''
```json ```json
{ {
...@@ -98,12 +184,23 @@ class ReflectRetryTool(LLMTool): ...@@ -98,12 +184,23 @@ class ReflectRetryTool(LLMTool):
"required": ["party_role", "rule", "facts", "findings"], "required": ["party_role", "rule", "facts", "findings"],
} }
) )
def _stringify_rule(self, rule: Dict) -> str:
    """Render a review-rule dict as a sectioned text block for the prompt.

    Each known field becomes a ``## <heading>`` line followed by the field
    value on the next line. Fields are emitted in a fixed order: title,
    rule, level, suggestion_template, case.

    Args:
        rule: Mapping describing one review rule. Missing keys, and keys
            whose value is ``None``, are rendered as an empty section so the
            literal text "None" never ends up in the prompt.

    Returns:
        The concatenated sections, each terminated by a newline.
    """
    # (heading shown in the prompt, key looked up in ``rule``)
    sections = (
        ("审查项标题", "title"),
        ("审查规则", "rule"),
        ("风险等级", "level"),
        ("建议模板", "suggestion_template"),
        ("参考案例", "case"),
    )
    parts = []
    for heading, key in sections:
        value = rule.get(key)
        parts.append(f"## {heading}\n{'' if value is None else value}\n")
    return "".join(parts)
def run(self, party_role: str, rule: Dict, facts: Optional[List[Dict]] = None, findings: Optional[List[Dict]] = None) -> List[Dict]: def run(self, party_role: str, rule: Dict, facts: Optional[List[Dict]] = None, findings: Optional[List[Dict]] = None) -> List[Dict]:
base_findings = self._build_findings_with_ids(findings or []) base_findings = self._build_findings_with_ids(findings or [])
if len(base_findings) == 0: if len(base_findings) == 0:
return [] return []
user_content = REFLECT_USER_PROMPT.format( user_content = REFLECT_USER_PROMPT.format(
rule=rule.get("rule",""), rule=self._stringify_rule(rule),
findings_json=json.dumps(base_findings, ensure_ascii=False), findings_json=json.dumps(base_findings, ensure_ascii=False),
facts_json=json.dumps(facts or [], ensure_ascii=False), facts_json=json.dumps(facts or [], ensure_ascii=False),
party_role=party_role, party_role=party_role,
......
...@@ -7,6 +7,7 @@ from typing import Dict, List, Optional ...@@ -7,6 +7,7 @@ from typing import Dict, List, Optional
from core.tool import tool, tool_func from core.tool import tool, tool_func
from core.tools.segment_llm import LLMTool from core.tools.segment_llm import LLMTool
from core.config import META_KEY from core.config import META_KEY
from loguru import logger
SUMMARY_SYSTEM_PROMPT = f''' SUMMARY_SYSTEM_PROMPT = f'''
你是合同事实提取智能体(SegmentSummary)。 你是合同事实提取智能体(SegmentSummary)。
...@@ -183,7 +184,7 @@ class SegmentSummaryTool(LLMTool): ...@@ -183,7 +184,7 @@ class SegmentSummaryTool(LLMTool):
data = self.parse_first_json(resp) data = self.parse_first_json(resp)
facts = data.get("facts") or {} facts = data.get("facts") or {}
except Exception as e: except Exception as e:
print(f'Error in segment summary for segment {segment_id}: {e}') logger.info(f'Error in segment summary for segment {segment_id}: {e}')
facts = {} facts = {}
facts[META_KEY] = { facts[META_KEY] = {
"segment_id": segment_id, "segment_id": segment_id,
......
...@@ -12,18 +12,21 @@ from loguru import logger ...@@ -12,18 +12,21 @@ from loguru import logger
from utils.common_util import random_str from utils.common_util import random_str
from utils.http_util import upload_file, fastgpt_openai_chat, download_file from utils.http_util import upload_file, fastgpt_openai_chat, download_file
SUFFIX='_麓发迁移' # SUFFIX='_麓发迁移'
batch_input_dir_path = 'jp-input' # batch_input_dir_path = 'jp-input'
batch_output_dir_path = 'jp-output-lufa-simple-new' # batch_output_dir_path = 'jp-output-lufa-new'
SUFFIX='_麓发'
batch_input_dir_path = 'lufa-input'
batch_output_dir_path = 'lufa-output'
batch_size = 5 batch_size = 5
# 麓发fastgpt接口 # 麓发fastgpt接口
# url = 'http://192.168.252.71:18089/api/v1/chat/completions' url = 'http://192.168.252.71:18089/api/v1/chat/completions'
# 金盘fastgpt接口 # 金盘fastgpt接口
url = 'http://192.168.252.71:18088/api/v1/chat/completions' # url = 'http://192.168.252.71:18088/api/v1/chat/completions'
# 麓发合同审查生产token # 麓发合同审查生产token
# token = 'fastgpt-ek3Z6PxI6sXgYc0jxzZ5bVGqrxwM6aVyfSmA6JVErJYBMr2KmYxrHwEUOIMSYz' token = 'fastgpt-ek3Z6PxI6sXgYc0jxzZ5bVGqrxwM6aVyfSmA6JVErJYBMr2KmYxrHwEUOIMSYz'
# 金盘迁移麓发合同审查测试token # 金盘迁移麓发合同审查测试token
token = 'fastgpt-vykT6qs07g7hR4tL2MNJE6DdNCIxaQjEu3Cxw9nuTBFg8MAG3CkByvnXKxSNEyMK7' # token = 'fastgpt-vykT6qs07g7hR4tL2MNJE6DdNCIxaQjEu3Cxw9nuTBFg8MAG3CkByvnXKxSNEyMK7'
# 人机交互测试(测试环境) # 人机交互测试(测试环境)
# token = 'fastgpt-p189K5zoTX5wjp0dBybFCwsbWm3juIwlJxt2wTGyiaOWOANI5Y10pKEZzyt' # token = 'fastgpt-p189K5zoTX5wjp0dBybFCwsbWm3juIwlJxt2wTGyiaOWOANI5Y10pKEZzyt'
# 人机交互测试(生产环境) # 人机交互测试(生产环境)
......
...@@ -2,9 +2,9 @@ from spire.doc import * ...@@ -2,9 +2,9 @@ from spire.doc import *
from spire.doc.common import * from spire.doc.common import *
# 创建一个 Document 类对象并加载一个 Word 文档 # 创建一个 Document 类对象并加载一个 Word 文档
benchmark_path = '/home/ccran/contract_review_agent/benchmark' benchmark_path = "/home/ccran/contract_review_agent/benchmark"
datasets_path = f'{benchmark_path}/datasets' datasets_path = f"{benchmark_path}/datasets"
clean_path = f'{benchmark_path}/clean' clean_path = f"{benchmark_path}/clean"
items = os.listdir(datasets_path) items = os.listdir(datasets_path)
for item in items: for item in items:
# 创建一个 Document 类的对象 # 创建一个 Document 类的对象
......
...@@ -101,6 +101,9 @@ def _compare_impl(val_dir: Path, answer_dir: Path) -> None: ...@@ -101,6 +101,9 @@ def _compare_impl(val_dir: Path, answer_dir: Path) -> None:
unmatched_val_count = sum(len(v) for v in unmatched_val_by_item.values()) unmatched_val_count = sum(len(v) for v in unmatched_val_by_item.values())
unmatched_answer_count = sum(len(v) for v in unmatched_answer_by_item.values()) unmatched_answer_count = sum(len(v) for v in unmatched_answer_by_item.values())
file_precision = (matched_total / val_total) if val_total != 0 else 0
file_recall = (matched_total / answer_total) if answer_total != 0 else 0
file_f1 = (2 * file_precision * file_recall / (file_precision + file_recall)) if (file_precision + file_recall) else 0
file_false_positive_rate = (unmatched_val_count / val_total) if val_total != 0 else 0 file_false_positive_rate = (unmatched_val_count / val_total) if val_total != 0 else 0
# 累加到各“审查项”的全局统计 # 累加到各“审查项”的全局统计
...@@ -115,8 +118,10 @@ def _compare_impl(val_dir: Path, answer_dir: Path) -> None: ...@@ -115,8 +118,10 @@ def _compare_impl(val_dir: Path, answer_dir: Path) -> None:
print('#' * 40) print('#' * 40)
print( print(
f"{val_file.name}: matched {matched_total} | val {val_total} | answer {answer_total} " f"{val_file.name}: matched {matched_total} | val {val_total} | answer {answer_total} "
f"| unmatched val {unmatched_val_count} | unmatched answer {unmatched_answer_count} | recall {matched_total / answer_total:.2%} | false_positive_rate {file_false_positive_rate:.2%}" f"| unmatched val {unmatched_val_count} | unmatched answer {unmatched_answer_count} | precision {file_precision:.2%} | recall {file_recall:.2%} | f1 {file_f1:.2%} | false_positive_rate {file_false_positive_rate:.2%}"
) )
import json
print(f'unmatched_val_by_item: {json.dumps(unmatched_val_by_item, ensure_ascii=False, indent=2)}')
for item in sorted(answer_counts): for item in sorted(answer_counts):
item_matches = matched_by_item.get(item, []) item_matches = matched_by_item.get(item, [])
print(f" 审查项 {item}: matched {len(item_matches)} / {answer_counts[item]}") print(f" 审查项 {item}: matched {len(item_matches)} / {answer_counts[item]}")
...@@ -136,10 +141,12 @@ def _compare_impl(val_dir: Path, answer_dir: Path) -> None: ...@@ -136,10 +141,12 @@ def _compare_impl(val_dir: Path, answer_dir: Path) -> None:
for t in uv: for t in uv:
print(f" val: {t}") print(f" val: {t}")
# break # only first file for demo # break # only first file for demo
precision = overall_matched / overall_val if overall_val else 0
recall = overall_matched / overall_answer if overall_answer else 0 recall = overall_matched / overall_answer if overall_answer else 0
f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) else 0
overall_false_positive_rate = (overall_val - overall_matched) / overall_val if overall_val else 0 overall_false_positive_rate = (overall_val - overall_matched) / overall_val if overall_val else 0
print( print(
f"Overall: matched {overall_matched} | val {overall_val} | answer {overall_answer} | recall {recall:.2%} | false_positive_rate {overall_false_positive_rate:.2%}" f"Overall: matched {overall_matched} | val {overall_val} | answer {overall_answer} | precision {precision:.2%} | recall {recall:.2%} | f1 {f1:.2%}"
) )
# 按“审查项”的 overall 结果 # 按“审查项”的 overall 结果
...@@ -153,7 +160,9 @@ def _compare_impl(val_dir: Path, answer_dir: Path) -> None: ...@@ -153,7 +160,9 @@ def _compare_impl(val_dir: Path, answer_dir: Path) -> None:
mat = overall_item_matched.get(it, 0) mat = overall_item_matched.get(it, 0)
u_ans = overall_item_unmatched_answer.get(it, 0) u_ans = overall_item_unmatched_answer.get(it, 0)
u_val = overall_item_unmatched_val.get(it, 0) u_val = overall_item_unmatched_val.get(it, 0)
item_precision = (mat / (mat + u_val)) if (mat + u_val) else 0
acc = (mat / ans) if ans else 0 acc = (mat / ans) if ans else 0
item_f1 = (2 * item_precision * acc / (item_precision + acc)) if (item_precision + acc) else 0
item_false_positive_rate = u_val / (mat + u_val) if (mat + u_val) else 0 item_false_positive_rate = u_val / (mat + u_val) if (mat + u_val) else 0
rows_by_item.append({ rows_by_item.append({
"审查项": it, "审查项": it,
...@@ -161,16 +170,20 @@ def _compare_impl(val_dir: Path, answer_dir: Path) -> None: ...@@ -161,16 +170,20 @@ def _compare_impl(val_dir: Path, answer_dir: Path) -> None:
"合同所有不合格项": ans, "合同所有不合格项": ans,
"大模型其他不合格项": u_val, "大模型其他不合格项": u_val,
"大模型未匹配上的不合格项(C-B)": u_ans, "大模型未匹配上的不合格项(C-B)": u_ans,
"查准率(B/B+D)": item_precision,
"查全率(B/C)": acc, "查全率(B/C)": acc,
"F1": item_f1,
"误报率(D/B+D)": item_false_positive_rate, "误报率(D/B+D)": item_false_positive_rate,
}) })
print( print(
f" 审查项 {it}: matched {mat} / answer {ans} | unmatched val {u_val} | unmatched answer {u_ans} | recall {acc:.2%} | false_positive_rate {item_false_positive_rate:.2%}" f" 审查项 {it}: matched {mat} / answer {ans} | unmatched val {u_val} | unmatched answer {u_ans} | precision {item_precision:.2%} | recall {acc:.2%} | f1 {item_f1:.2%}"
) )
overall_by_item_df = pd.DataFrame(rows_by_item, columns=["审查项", "大模型匹配上的不合格项", "合同所有不合格项", "大模型其他不合格项", "大模型未匹配上的不合格项(C-B)", "查全率(B/C)", "误报率(D/B+D)"]) overall_by_item_df = pd.DataFrame(rows_by_item, columns=["审查项", "大模型匹配上的不合格项", "合同所有不合格项", "大模型其他不合格项", "大模型未匹配上的不合格项(C-B)", "查准率(B/B+D)", "查全率(B/C)", "F1", "误报率(D/B+D)"])
unmatched_val_total = sum(overall_item_unmatched_val.values()) unmatched_val_total = sum(overall_item_unmatched_val.values())
unmatched_answer_total = sum(overall_item_unmatched_answer.values()) unmatched_answer_total = sum(overall_item_unmatched_answer.values())
overall_precision = overall_matched / (overall_matched + unmatched_val_total) if (overall_matched + unmatched_val_total) else 0
overall_f1 = (2 * overall_precision * recall / (overall_precision + recall)) if (overall_precision + recall) else 0
overall_invalid_rate = unmatched_val_total / (overall_matched + unmatched_val_total) if (overall_matched + unmatched_val_total) else 0 overall_invalid_rate = unmatched_val_total / (overall_matched + unmatched_val_total) if (overall_matched + unmatched_val_total) else 0
overall_total_df = pd.DataFrame([ overall_total_df = pd.DataFrame([
{ {
...@@ -179,10 +192,12 @@ def _compare_impl(val_dir: Path, answer_dir: Path) -> None: ...@@ -179,10 +192,12 @@ def _compare_impl(val_dir: Path, answer_dir: Path) -> None:
"合同所有不合格项": overall_answer, "合同所有不合格项": overall_answer,
"大模型其他不合格项": unmatched_val_total, "大模型其他不合格项": unmatched_val_total,
"大模型未匹配上的不合格项(C-B)": unmatched_answer_total, "大模型未匹配上的不合格项(C-B)": unmatched_answer_total,
"查准率(B/B+D)": overall_precision,
"查全率(B/C)": recall, "查全率(B/C)": recall,
"F1": overall_f1,
"误报率(D/B+D)": overall_invalid_rate, "误报率(D/B+D)": overall_invalid_rate,
} }
], columns=["审查项", "大模型匹配上的不合格项", "合同所有不合格项", "大模型其他不合格项", "大模型未匹配上的不合格项(C-B)", "查全率(B/C)", "误报率(D/B+D)"]) ], columns=["审查项", "大模型匹配上的不合格项", "合同所有不合格项", "大模型其他不合格项", "大模型未匹配上的不合格项(C-B)", "查准率(B/B+D)", "查全率(B/C)", "F1", "误报率(D/B+D)"])
combined_df = pd.concat([overall_by_item_df, overall_total_df], ignore_index=True) combined_df = pd.concat([overall_by_item_df, overall_total_df], ignore_index=True)
compare_dir_name = val_dir.name compare_dir_name = val_dir.name
......
...@@ -13,146 +13,146 @@ from compare_annotation import compare_with_log ...@@ -13,146 +13,146 @@ from compare_annotation import compare_with_log
# Map raw comment authors to unified review item names. # Map raw comment authors to unified review item names.
COMMENT_AUTHOR_MAPPING: dict[str, str] = { COMMENT_AUTHOR_MAPPING: dict[str, str] = {
"三方货款审查":"第三方审查", "三方货款审查": "第三方审查",
"履行义务审查":"第三方审查", "履行义务审查": "第三方审查",
"违约条款审查":"违约与延期审查", "违约条款审查": "违约与延期审查",
"延期审查":"违约与延期审查" "延期审查": "违约与延期审查",
} }
# Control characters stripped from cell text before export: every C0 control
# except tab (\x09), line feed (\x0A) and carriage return (\x0D).
# Compiled once at import time instead of on every call.
_ILLEGAL_CHARS_RE = re.compile(r"[\x00-\x08\x0B-\x0C\x0E-\x1F]")


def clean_illegal(value: object) -> object:
    """Remove illegal control characters from a string value.

    Args:
        value: Any object. Only ``str`` instances are modified.

    Returns:
        The string with the control characters removed, or ``value``
        unchanged when it is not a string.
    """
    if isinstance(value, str):
        return _ILLEGAL_CHARS_RE.sub("", value)
    return value
def normalize_comment_author(author: str) -> str:
    """Map a raw comment author to its unified review-item name.

    The author is stripped of surrounding whitespace first. An author that is
    empty after stripping is returned as that empty string; otherwise the
    stripped name is looked up in ``COMMENT_AUTHOR_MAPPING`` and returned
    unchanged when it has no entry there.
    """
    stripped = author.strip()
    if stripped:
        return COMMENT_AUTHOR_MAPPING.get(stripped, stripped)
    return stripped
def extract_annotaion(
    datasets_dir: Path,
    output_dir: Path,
    strip_suffixes: list[str] | None = None,
) -> None:
    """Extract review comments from Word files to xlsx files.

    Every regular, non-``.xlsx`` file directly inside ``datasets_dir`` is
    loaded as a document; one row per comment is written to
    ``<output_dir>/<stem>.xlsx``.

    Args:
        datasets_dir: Directory containing the annotated documents.
        output_dir: Directory receiving the xlsx files (created if missing).
        strip_suffixes: Filename-stem suffixes to remove (first match only)
            when naming the output file. ``None`` means no stripping.
    """
    datasets_dir = datasets_dir.resolve()
    output_dir = output_dir.resolve()
    output_dir.mkdir(parents=True, exist_ok=True)
    strip_suffixes = strip_suffixes or []

    for item in sorted(datasets_dir.iterdir()):
        if item.suffix.lower() == ".xlsx" or not item.is_file():
            continue

        document = Document()
        try:
            document.LoadFromFile(str(item))

            comments: list[dict[str, str]] = []
            for i in range(document.Comments.Count):
                comment = document.Comments[i]

                # Each comment paragraph contributes its text plus a newline.
                paragraphs = comment.Body.Paragraphs
                comment_text = "".join(
                    paragraphs[j].Text + "\n" for j in range(paragraphs.Count)
                )

                # The author may be "<prefix>|<review item>"; only the part
                # after the first "|" is used for comparison.
                raw_author = comment.Format.Author
                _, separator, after_bar = raw_author.partition("|")
                comment_author = normalize_comment_author(
                    after_bar if separator else raw_author
                )

                comments.append(
                    {
                        "审查项": clean_illegal(comment_author),
                        "合同原文": clean_illegal(comment.OwnerParagraph.Text),
                        "建议": clean_illegal(comment_text),
                    }
                )

            df = pd.DataFrame(comments)
            clean_stem = _strip_suffix_once(item.stem, strip_suffixes)
            # Fall back to the full stem if stripping left nothing.
            output_stem = clean_stem or item.stem
            output_file = output_dir / f"{output_stem}.xlsx"
            df.to_excel(output_file, index=False)
        finally:
            # Always release the document, even when loading or export fails.
            document.Close()
def compare_annotaion(val_dir: Path, answer_dir: Path) -> None:
    """Run benchmark comparison on extracted annotations.

    Delegates to ``compare_with_log`` and prints the path it returns.
    """
    written_log = compare_with_log(val_dir=val_dir, answer_dir=answer_dir)
    print(f"Compare log written to: {written_log}")
def _strip_suffix_once(stem: str, suffixes: Iterable[str]) -> str:
    """Remove the first matching suffix from ``stem``.

    Suffixes are tried in iteration order; empty suffixes are ignored. Only
    one suffix is removed, and only a single occurrence of it.

    Args:
        stem: Filename stem to clean.
        suffixes: Candidate suffixes.

    Returns:
        ``stem`` without the first suffix it ends with, or ``stem`` unchanged
        when none matches.
    """
    for suffix in suffixes:
        if suffix and stem.endswith(suffix):
            return stem.removesuffix(suffix)
    return stem
def eval(
    datasets_dir: Path,
    answer_dir: Path,
    val_dir: Path,
    strip_suffixes: list[str] | None = None,
) -> None:
    """Pipeline: extract annotations first, then compare against ground truth.

    NOTE(review): this function shadows the builtin ``eval`` within the
    module; the name is kept because callers depend on it.

    Args:
        datasets_dir: Directory with the annotated documents to extract from.
        answer_dir: Directory with the labeled answer files.
        val_dir: Directory receiving the extracted files, then compared.
        strip_suffixes: Stem suffixes forwarded to the extraction step;
            ``None`` or an empty list is forwarded as an empty list.
    """
    suffixes = strip_suffixes if strip_suffixes else []
    extract_annotaion(
        datasets_dir=datasets_dir,
        output_dir=val_dir,
        strip_suffixes=suffixes,
    )
    compare_annotaion(val_dir=val_dir, answer_dir=answer_dir)
def _parse_args() -> argparse.Namespace:
    """Parse the command-line options of the evaluation script.

    All path defaults are resolved relative to the directory containing this
    file.

    Returns:
        Namespace with ``datasets_dir``, ``answer_dir``, ``val_dir`` and
        ``strip_suffixes``.
    """
    script_dir = Path(__file__).parent
    results_root = script_dir / "results"

    cli = argparse.ArgumentParser(
        description="Extract review comments from docs and evaluate against answers."
    )

    # (flag, default path, help text) for the three directory options.
    directory_options = (
        (
            "--datasets-dir",
            results_root / "jp-output-renji",
            "Directory containing Word files with annotations.",
        ),
        (
            "--answer-dir",
            script_dir / "审查答案",
            "Directory containing labeled answer xlsx files.",
        ),
        (
            "--val-dir",
            results_root / "jp-output-renji-extracted",
            "Directory to store extracted xlsx files for comparison.",
        ),
    )
    for flag, default_path, help_text in directory_options:
        cli.add_argument(flag, type=Path, default=default_path, help=help_text)

    cli.add_argument(
        "--strip-suffixes",
        nargs="*",
        default=["_麓发改进", "_人机交互", "_麓发迁移"],
        help=(
            "Optional filename suffixes to strip from generated val xlsx stems before "
            "comparison, e.g. --strip-suffixes _v1 _审阅版"
        ),
    )
    return cli.parse_args()
if __name__ == "__main__":
    # Script entry point: parse the CLI options and forward them, by name,
    # to the extraction + comparison pipeline.
    options = _parse_args()
    pipeline_kwargs = {
        name: getattr(options, name)
        for name in ("datasets_dir", "answer_dir", "val_dir", "strip_suffixes")
    }
    eval(**pipeline_kwargs)
No preview for this file type
import numpy as np for _ in range(input()):
try:
def calculate_grpo_advantages(rewards, epsilon=1e-8): eval(raw_input())
""" print("YES")
计算 GRPO 的组优势值 except TypeError:
:param rewards: 列表或数组,包含同一组样本的奖励值 print("NO")
:param epsilon: 稳定性系数,防止除以 0 except:
:return: 归一化后的优势值数组 print("NO")
"""
rewards = np.array(rewards)
# 1. 计算当前组的平均值
mean = np.mean(rewards)
# 2. 计算当前组的标准差
std = np.std(rewards)
# 3. 归一化计算优势
# 减去均值除以标准差,使得该组优势值满足均值为 0,标准差为 1
advantages = (rewards - mean) / (std + epsilon)
return advantages
# 示例数据
your_rewards = [1.1,1.1,1.1,1.1]
advantages = calculate_grpo_advantages(your_rewards)
print(f"原始奖励值: {your_rewards}")
print(f"GRPO 优势值: {advantages.round(4)}")
\ No newline at end of file
...@@ -20,7 +20,7 @@ class OpenAITool: ...@@ -20,7 +20,7 @@ class OpenAITool:
base_url=llm_config.base_url, api_key=llm_config.api_key base_url=llm_config.base_url, api_key=llm_config.api_key
) )
@retry(stop=stop_after_delay(600) | stop_after_attempt(1), wait=wait_fixed(1)) @retry(stop=stop_after_delay(600) | stop_after_attempt(3), wait=wait_fixed(1))
async def chat(self, msg, tools=None): async def chat(self, msg, tools=None):
if tools is None: if tools is None:
extra_body = None extra_body = None
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment