Commit d26c53e1 by ccran

feat: add readme.md

parent 5f18aa67
...@@ -7,5 +7,7 @@ ...@@ -7,5 +7,7 @@
# Keep Python source files # Keep Python source files
!**/*.py !**/*.py
!README.md
# Keep this file tracked # Keep this file tracked
!.gitignore !.gitignore
\ No newline at end of file
# 合同审查智能体 (Contract Review Agent)
一个基于 FastAPI 和大型语言模型 (LLM) 的智能合同审查系统,能够自动分析合同条款、识别风险并提供审查建议。
## 📋 项目概述
本项目是一个合同审查智能体,通过以下流程实现合同的自动化审查:
1. **文档解析** - 支持多种格式的合同文档解析
2. **分段处理** - 将合同按规则智能分段
3. **事实提取** - 从每个分段中提取与审查规则相关的客观事实
4. **规则审查** - 基于预设规则对提取的事实进行审查
5. **风险复核** - 对审查结果进行反思和复核
6. **结果合并** - 合并所有分段审查结果生成最终报告
## 🏗️ 项目结构
```
lufa-contract/
├── main.py # FastAPI 主应用入口
├── test.py # 测试脚本
├── core/ # 核心业务逻辑
│ ├── cache.py # 缓存管理
│ ├── config.py # 配置管理
│ ├── memory.py # 记忆/状态管理
│ ├── tool.py # 工具基类
│ └── tools/ # 具体工具实现
│ ├── segment_summary.py # 分段事实提取
│ ├── segment_review.py # 分段规则审查
│ ├── segment_rule_router.py # 规则路由
│ ├── retrieve_reference.py # 参考检索
│ ├── reflect_retry.py # 反思重试
│ └── segment_merger.py # 结果合并
├── data/ # 数据文件
│ ├── rules.xlsx # 审查规则表
│ ├── batch/ # 批量处理数据
│ └── benchmark/ # 基准测试数据
├── utils/ # 工具函数
│ ├── common_util.py # 通用工具
│ ├── http_util.py # HTTP 工具
│ └── doc_util.py # 文档工具
├── demo/ # 演示文件
├── tmp/ # 临时文件
└── .vscode/ # VSCode 配置
```
## 🔧 技术栈
- **后端框架**: FastAPI
- **LLM 服务**: Qwen2-72B-Instruct (可配置)
- **文档处理**: 支持 PDF、Word 等多种格式
- **日志**: Loguru
- **数据验证**: Pydantic
## 📦 核心功能
### 1. 分段事实提取 (SegmentSummary)
基于审查规则从合同分段中提取客观事实,确保:
- 事实可在原文中直接找到
- 不做抽象、概括或推断
- 不补充未出现的主体、条件或数值
### 2. 分段规则审查 (SegmentReview)
对提取的事实进行规则匹配和风险分析,输出:
- 风险等级 (H/M/L)
- 审查结论
- 修改建议
### 3. 反思重试 (ReflectRetry)
对审查结果进行自我反思,识别潜在问题并重试
### 4. 结果合并 (SegmentMerger)
合并所有分段的审查结果,生成完整的审查报告
## ⚙️ 配置说明
在 `core/config.py` 中可配置:
```python
# LLM 配置
LLMConfig:
base_url: "http://192.168.252.71:9002/v1"
model: "Qwen2-72B-Instruct"
# 审查规则集
ALL_RULESET_IDS = ["通用", "借款", "担保", "财务口", "金盘", "金盘简化"]
# 分段大小控制
MAX_SINGLE_CHUNK_SIZE = 5000
```
## 🚀 快速开始
### 1. 安装依赖
```bash
pip install fastapi uvicorn pydantic loguru
```
### 2. 启动服务
```bash
python main.py
```
服务将在 `http://localhost:8000` 启动
### 3. API 端点
- `POST /sleep` - 测试端点
- `POST /document/parse` - 解析合同文档
- `POST /contract/review` - 执行合同审查
- `GET /contract/{conversation_id}/result` - 获取审查结果
## 📝 使用示例
### 提交合同审查请求
```python
import requests
# 提交合同文档解析请求 (通过 file_url 指定文档地址)
response = requests.post(
"http://localhost:8000/document/parse",
json={
"conversation_id": "unique-conversation-id",
"file_url": "http://example.com/contract.pdf",
"ruleset_id": "通用"
}
)
# 获取审查结果
result = requests.get(
f"http://localhost:8000/contract/{response.json()['conversation_id']}/result"
)
```
## 🔐 安全说明
- API Key 配置在 `core/config.py` (请勿将真实密钥提交到版本库,建议通过环境变量注入)
- 支持内外网环境切换 (`use_lufa` 参数)
- 临时文件自动清理
## 📊 数据格式
### 审查结果结构
```json
{
"conversation_id": "xxx",
"findings": [
{
"segment_id": "seg_001",
"rule_id": "rule_001",
"risk_level": "H",
"fact": "提取的事实",
"conclusion": "审查结论",
"suggestion": "修改建议"
}
]
}
```
## 🛠️ 开发指南
### 添加新的审查规则
1. 在 `data/rules.xlsx` 中添加新规则
2. 更新 `core/config.py` 中的规则集配置
3. 重启服务
### 自定义 LLM 模型
修改 `core/config.py` 中的 `LLMConfig`:
```python
LLMConfig:
base_url: "你的 LLM 服务地址"
model: "你的模型名称"
```
## 📄 许可证
内部使用,保留所有权利。
## 👥 维护者
- 开发团队
## 📞 联系方式
如有问题,请联系项目维护团队。
...@@ -9,7 +9,6 @@ from core.tool import ToolBase, tool, tool_func ...@@ -9,7 +9,6 @@ from core.tool import ToolBase, tool, tool_func
from utils.excel_util import ExcelUtil from utils.excel_util import ExcelUtil
@tool("retrieve_reference", "审查参考检索") @tool("retrieve_reference", "审查参考检索")
class RetrieveReferenceTool(ToolBase): class RetrieveReferenceTool(ToolBase):
def __init__(self) -> None: def __init__(self) -> None:
...@@ -22,12 +21,16 @@ class RetrieveReferenceTool(ToolBase): ...@@ -22,12 +21,16 @@ class RetrieveReferenceTool(ToolBase):
"triggers": "触发词", "triggers": "触发词",
"suggestion_template": "建议模板", "suggestion_template": "建议模板",
"case": "案例", "case": "案例",
"summary":"摘要项" "summary": "摘要项",
} }
rules_path = Path(__file__).resolve().parent.parent.parent / "data" / "rules.xlsx" rules_path = (
Path(__file__).resolve().parent.parent.parent / "data" / "rules.xlsx"
)
self.rulesets: Dict[str, List[Dict[str, Any]]] = {} self.rulesets: Dict[str, List[Dict[str, Any]]] = {}
for rs_id in ALL_RULESET_IDS: for rs_id in ALL_RULESET_IDS:
rules = ExcelUtil.load_mapped_excel(rules_path, sheet_name=rs_id, column_map=self.column_map) rules = ExcelUtil.load_mapped_excel(
rules_path, sheet_name=rs_id, column_map=self.column_map
)
self.rulesets[rs_id] = rules self.rulesets[rs_id] = rules
@tool_func( @tool_func(
...@@ -40,13 +43,21 @@ class RetrieveReferenceTool(ToolBase): ...@@ -40,13 +43,21 @@ class RetrieveReferenceTool(ToolBase):
"required": [], "required": [],
} }
) )
def run(self, ruleset_id: str = "", routed_rule_titles: List[str] | None = None) -> Dict[str, Any]: def run(
self, ruleset_id: str = "", routed_rule_titles: List[str] | None = None
) -> Dict[str, Any]:
target_ruleset_id = ruleset_id or self.default_ruleset_id target_ruleset_id = ruleset_id or self.default_ruleset_id
full_rules = self.rulesets.get(target_ruleset_id) or self.rulesets.get(self.default_ruleset_id, []) or [] full_rules = (
self.rulesets.get(target_ruleset_id)
or self.rulesets.get(self.default_ruleset_id, [])
or []
)
if routed_rule_titles is None: if routed_rule_titles is None:
rules = full_rules rules = full_rules
else: else:
title_set = {title for title in routed_rule_titles if isinstance(title, str)} title_set = {
title for title in routed_rule_titles if isinstance(title, str)
}
rules = [r for r in full_rules if r.get("title") in title_set] rules = [r for r in full_rules if r.get("title") in title_set]
return { return {
...@@ -59,6 +70,7 @@ class RetrieveReferenceTool(ToolBase): ...@@ -59,6 +70,7 @@ class RetrieveReferenceTool(ToolBase):
def summary_keywords(self, rules: List[Dict[str, Any]]) -> List[str]: def summary_keywords(self, rules: List[Dict[str, Any]]) -> List[str]:
return [r.get("summary", "") for r in rules if r.get("summary")] return [r.get("summary", "") for r in rules if r.get("summary")]
if __name__ == "__main__": if __name__ == "__main__":
tool = RetrieveReferenceTool() tool = RetrieveReferenceTool()
result = tool.run(ruleset_id="金盘", routed_rule_titles=None) result = tool.run(ruleset_id="金盘", routed_rule_titles=None)
...@@ -66,4 +78,4 @@ if __name__ == "__main__": ...@@ -66,4 +78,4 @@ if __name__ == "__main__":
print(f"Rule Title: {rule.get('title')}") print(f"Rule Title: {rule.get('title')}")
print(f"Case: {rule.get('case')}") print(f"Case: {rule.get('case')}")
print("-" * 20) print("-" * 20)
# print(result.get("total", 0)) # print(result.get("total", 0))
\ No newline at end of file
...@@ -3,7 +3,7 @@ import os ...@@ -3,7 +3,7 @@ import os
import re import re
import sys import sys
sys.path.append('../..') sys.path.append("../..")
import traceback import traceback
import concurrent.futures import concurrent.futures
...@@ -12,21 +12,21 @@ from loguru import logger ...@@ -12,21 +12,21 @@ from loguru import logger
from utils.common_util import random_str from utils.common_util import random_str
from utils.http_util import upload_file, fastgpt_openai_chat, download_file from utils.http_util import upload_file, fastgpt_openai_chat, download_file
# SUFFIX='_麓发迁移' SUFFIX = "_麓发迁移"
# batch_input_dir_path = 'jp-input' batch_input_dir_path = "jp-input"
# batch_output_dir_path = 'jp-output-lufa-new' batch_output_dir_path = "jp-output-lufa-new"
SUFFIX='_麓发' # SUFFIX = "_麓发"
batch_input_dir_path = 'lufa-input' # batch_input_dir_path = "lufa-input"
batch_output_dir_path = 'lufa-output' # batch_output_dir_path = "lufa-output"
batch_size = 5 batch_size = 5
# 麓发fastgpt接口 # 麓发fastgpt接口
url = 'http://192.168.252.71:18089/api/v1/chat/completions' # url = "http://192.168.252.71:18089/api/v1/chat/completions"
# 金盘fastgpt接口 # 金盘fastgpt接口
# url = 'http://192.168.252.71:18088/api/v1/chat/completions' url = "http://192.168.252.71:18088/api/v1/chat/completions"
# 麓发合同审查生产token # 麓发合同审查生产token
token = 'fastgpt-ek3Z6PxI6sXgYc0jxzZ5bVGqrxwM6aVyfSmA6JVErJYBMr2KmYxrHwEUOIMSYz' # token = "fastgpt-ek3Z6PxI6sXgYc0jxzZ5bVGqrxwM6aVyfSmA6JVErJYBMr2KmYxrHwEUOIMSYz"
# 金盘迁移麓发合同审查测试token # 金盘迁移麓发合同审查测试token
# token = 'fastgpt-vykT6qs07g7hR4tL2MNJE6DdNCIxaQjEu3Cxw9nuTBFg8MAG3CkByvnXKxSNEyMK7' token = "fastgpt-vykT6qs07g7hR4tL2MNJE6DdNCIxaQjEu3Cxw9nuTBFg8MAG3CkByvnXKxSNEyMK7"
# 人机交互测试(测试环境) # 人机交互测试(测试环境)
# token = 'fastgpt-p189K5zoTX5wjp0dBybFCwsbWm3juIwlJxt2wTGyiaOWOANI5Y10pKEZzyt' # token = 'fastgpt-p189K5zoTX5wjp0dBybFCwsbWm3juIwlJxt2wTGyiaOWOANI5Y10pKEZzyt'
# 人机交互测试(生产环境) # 人机交互测试(生产环境)
...@@ -34,9 +34,13 @@ token = 'fastgpt-ek3Z6PxI6sXgYc0jxzZ5bVGqrxwM6aVyfSmA6JVErJYBMr2KmYxrHwEUOIMSYz' ...@@ -34,9 +34,13 @@ token = 'fastgpt-ek3Z6PxI6sXgYc0jxzZ5bVGqrxwM6aVyfSmA6JVErJYBMr2KmYxrHwEUOIMSYz'
# 提取后审查测试 # 提取后审查测试
# token = 'fastgpt-n74gGX5ZqLT6o1ysMBSGUTjIciswYOWDRfQ75krMkE5gDVDkpzsbz8u' # token = 'fastgpt-n74gGX5ZqLT6o1ysMBSGUTjIciswYOWDRfQ75krMkE5gDVDkpzsbz8u'
def extract_url(text): def extract_url(text):
# \s * ([ ^ "\s]+?\.(?:docx?|pdf|xlsx)) # \s * ([ ^ "\s]+?\.(?:docx?|pdf|xlsx))
excel_p, doc_p = r'最终审查Excel\s*([^"]*xlsx)', r'最终审查批注\s*([^\" ]+?\.(?:docx?|pdf|wps))' excel_p, doc_p = (
r'最终审查Excel\s*([^"]*xlsx)',
r"最终审查批注\s*([^\" ]+?\.(?:docx?|pdf|wps))",
)
# 使用 re.search() 查找第一个匹配项 # 使用 re.search() 查找第一个匹配项
excel_m, doc_m = re.search(excel_p, text), re.search(doc_p, text) excel_m, doc_m = re.search(excel_p, text), re.search(doc_p, text)
if excel_m and doc_m: if excel_m and doc_m:
...@@ -46,7 +50,9 @@ def extract_url(text): ...@@ -46,7 +50,9 @@ def extract_url(text):
return None, None return None, None
def process_single_file(file, batch_input_dir_path, batch_output_dir_path, counter, start_file): def process_single_file(
file, batch_input_dir_path, batch_output_dir_path, counter, start_file
):
""" """
单文件处理逻辑,可被线程池并发调用 单文件处理逻辑,可被线程池并发调用
""" """
...@@ -55,29 +61,45 @@ def process_single_file(file, batch_input_dir_path, batch_output_dir_path, count ...@@ -55,29 +61,45 @@ def process_single_file(file, batch_input_dir_path, batch_output_dir_path, count
return return
# 提取文件前缀 # 提取文件前缀
file_name = file[:file.rfind('.')] file_name = file[: file.rfind(".")]
ext_name = file[file.rfind('.'):] ext_name = file[file.rfind(".") :]
# 源目标处理 # 源目标处理
original_file = f'{batch_input_dir_path}/{file}' original_file = f"{batch_input_dir_path}/{file}"
des_check_file = f'{batch_output_dir_path}/{file_name}.md' des_check_file = f"{batch_output_dir_path}/{file_name}.md"
des_excel_file = f'{batch_output_dir_path}/{file_name}{SUFFIX}.xlsx' des_excel_file = f"{batch_output_dir_path}/{file_name}{SUFFIX}.xlsx"
des_doc_file = f'{batch_output_dir_path}/{file_name}{SUFFIX}{ext_name}' des_doc_file = f"{batch_output_dir_path}/{file_name}{SUFFIX}{ext_name}"
try: try:
# 处理原文件 # 处理原文件
file_url = upload_file(original_file, input_url_to_inner=True).replace('218.77.58.8', '192.168.252.71') file_url = upload_file(original_file, input_url_to_inner=True).replace(
model = 'Qwen2-72B-Instruct' "218.77.58.8", "192.168.252.71"
)
model = "Qwen2-72B-Instruct"
# 合同审核Excel工作流处理 # 合同审核Excel工作流处理
logger.info(' 第{}个文件,处理文件: {}'.format(counter, original_file)) logger.info(" 第{}个文件,处理文件: {}".format(counter, original_file))
result = fastgpt_openai_chat(url, token, model, random_str(), file_url, f'测试批处理任务-{file_name}', False) result = fastgpt_openai_chat(
url,
token,
model,
random_str(),
file_url,
f"测试批处理任务-{file_name}",
False,
)
excel_url, doc_url = extract_url(result) excel_url, doc_url = extract_url(result)
if excel_url and doc_url: if excel_url and doc_url:
download_file(excel_url.replace('218.77.58.8', '192.168.252.71'), des_excel_file) download_file(
download_file(doc_url.replace('218.77.58.8', '192.168.252.71'), des_doc_file) excel_url.replace("218.77.58.8", "192.168.252.71"), des_excel_file
logger.info(f'第{counter}个文件下载:{excel_url}到{des_excel_file} {des_doc_file}') )
download_file(
doc_url.replace("218.77.58.8", "192.168.252.71"), des_doc_file
)
logger.info(
f"第{counter}个文件下载:{excel_url}到{des_excel_file} {des_doc_file}"
)
except Exception as e: except Exception as e:
logger.error(f'{original_file} 处理异常 第{counter}个文件: {e}') logger.error(f"{original_file} 处理异常 第{counter}个文件: {e}")
logger.error(traceback.print_exc()) logger.error(traceback.print_exc())
...@@ -103,5 +125,5 @@ def execute_batch(max_workers: int = 4): ...@@ -103,5 +125,5 @@ def execute_batch(max_workers: int = 4):
f.result() f.result()
if __name__ == '__main__': if __name__ == "__main__":
execute_batch(batch_size) execute_batch(batch_size)
\ No newline at end of file
...@@ -121,7 +121,7 @@ def _parse_args() -> argparse.Namespace: ...@@ -121,7 +121,7 @@ def _parse_args() -> argparse.Namespace:
parser.add_argument( parser.add_argument(
"--datasets-dir", "--datasets-dir",
type=Path, type=Path,
default=base / "results" / "jp-output-renji", default=base / "results" / "jp-output-lufa",
help="Directory containing Word files with annotations.", help="Directory containing Word files with annotations.",
) )
parser.add_argument( parser.add_argument(
...@@ -133,7 +133,7 @@ def _parse_args() -> argparse.Namespace: ...@@ -133,7 +133,7 @@ def _parse_args() -> argparse.Namespace:
parser.add_argument( parser.add_argument(
"--val-dir", "--val-dir",
type=Path, type=Path,
default=base / "results" / "jp-output-renji-extracted", default=base / "results" / "jp-output-lufa-extracted",
help="Directory to store extracted xlsx files for comparison.", help="Directory to store extracted xlsx files for comparison.",
) )
parser.add_argument( parser.add_argument(
......
No preview for this file type
...@@ -9,7 +9,9 @@ class DocBase(ABC): ...@@ -9,7 +9,9 @@ class DocBase(ABC):
self._doc_path = None self._doc_path = None
self._doc_name = None self._doc_name = None
self._kwargs = kwargs self._kwargs = kwargs
self._max_single_chunk_size = kwargs.get('max_single_chunk_size', MAX_SINGLE_CHUNK_SIZE) self._max_single_chunk_size = kwargs.get(
"max_single_chunk_size", MAX_SINGLE_CHUNK_SIZE
)
@abstractmethod @abstractmethod
def load(self, doc_path): def load(self, doc_path):
......
...@@ -509,10 +509,10 @@ class SpireWordDoc(DocBase): ...@@ -509,10 +509,10 @@ class SpireWordDoc(DocBase):
cell_list.append(cell_content) cell_list.append(cell_content)
# table_data += "|" + "|".join(cell_list) + "|" # table_data += "|" + "|".join(cell_list) + "|"
# table_data += "\n" # table_data += "\n"
table_data += ' '.join(cell_list) + '\n' table_data += " ".join(cell_list) + "\n"
if i == 0: if i == 0:
# table_data += "|" + "|".join(["--- " for _ in cell_list]) + "|\n" # table_data += "|" + "|".join(["--- " for _ in cell_list]) + "|\n"
table_data= ' '.join(cell_list) + '\n' table_data = " ".join(cell_list) + "\n"
return table_data return table_data
def get_chunk_info(self, chunk_id): def get_chunk_info(self, chunk_id):
...@@ -608,14 +608,18 @@ class SpireWordDoc(DocBase): ...@@ -608,14 +608,18 @@ class SpireWordDoc(DocBase):
return True return True
def _update_comment_content(self, comment_idx, suggest): def _update_comment_content(self, comment_idx, suggest):
self._doc.Comments.get_Item(comment_idx).Body.Paragraphs.get_Item(0).Text = suggest self._doc.Comments.get_Item(comment_idx).Body.Paragraphs.get_Item(
0
).Text = suggest
def _try_add_comment_in_paragraphs(self, paragraphs, target_text, author, suggest): def _try_add_comment_in_paragraphs(self, paragraphs, target_text, author, suggest):
if not target_text: if not target_text:
return False return False
for paragraph in paragraphs: for paragraph in paragraphs:
text_sel = paragraph.Find(target_text, False, True) text_sel = paragraph.Find(target_text, False, True)
if text_sel and self.set_comment_by_text_selection(text_sel, author, suggest): if text_sel and self.set_comment_by_text_selection(
text_sel, author, suggest
):
return True return True
return False return False
...@@ -767,8 +771,11 @@ class SpireWordDoc(DocBase): ...@@ -767,8 +771,11 @@ class SpireWordDoc(DocBase):
# update chunk_id # update chunk_id
comment_chunk_id = comment.get("chunk_id", -1) comment_chunk_id = comment.get("chunk_id", -1)
# 优先使用comments里提供的chunk_id,如果没有或无效则使用外部传入的chunk_id,如果都没有则异常处理 # 优先使用comments里提供的chunk_id,如果没有或无效则使用外部传入的chunk_id,如果都没有则异常处理
sub_chunks = self.get_sub_chunks(comment_chunk_id) if comment_chunk_id != -1 \ sub_chunks = (
and comment_chunk_id < self.get_chunk_num() else self.get_sub_chunks(chunk_id) self.get_sub_chunks(comment_chunk_id)
if comment_chunk_id != -1 and comment_chunk_id < self.get_chunk_num()
else self.get_sub_chunks(chunk_id)
)
author = self.format_comment_author(comment) author = self.format_comment_author(comment)
suggest = comment.get("suggest", "") suggest = comment.get("suggest", "")
find_key = comment["original_text"].strip() or comment["key_points"] find_key = comment["original_text"].strip() or comment["key_points"]
...@@ -808,7 +815,9 @@ class SpireWordDoc(DocBase): ...@@ -808,7 +815,9 @@ class SpireWordDoc(DocBase):
normalized_author = self._normalize_author_prefix(author) normalized_author = self._normalize_author_prefix(author)
for i in range(self._doc.Comments.Count): for i in range(self._doc.Comments.Count):
current_comment = self._doc.Comments.get_Item(i) current_comment = self._doc.Comments.get_Item(i)
comment_author = self._normalize_author_prefix(current_comment.Format.Author) comment_author = self._normalize_author_prefix(
current_comment.Format.Author
)
if comment_author == normalized_author: if comment_author == normalized_author:
return i return i
return None return None
...@@ -876,9 +885,7 @@ class SpireWordDoc(DocBase): ...@@ -876,9 +885,7 @@ class SpireWordDoc(DocBase):
if __name__ == "__main__": if __name__ == "__main__":
doc = SpireWordDoc() doc = SpireWordDoc()
doc.load( doc.load(r"/home/ccran/lufa-contract/demo/今麦郎合同审核.docx")
r"/home/ccran/lufa-contract/demo/今麦郎合同审核.docx"
)
print(doc._doc_name) print(doc._doc_name)
print("附件2《技术协议》" in doc.get_all_text()) print("附件2《技术协议》" in doc.get_all_text())
# doc.add_chunk_comment( # doc.add_chunk_comment(
...@@ -895,4 +902,4 @@ if __name__ == "__main__": ...@@ -895,4 +902,4 @@ if __name__ == "__main__":
# } # }
# ], # ],
# ) # )
# doc.to_file("/home/ccran/lufa-contract/demo/今麦郎合同审核_test.docx", True) # doc.to_file("/home/ccran/lufa-contract/demo/今麦郎合同审核_test.docx", True)
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment