feat: add readme.md

d26c53e1 · ccran · 5f18aa67 · d26c53e1 · d26c53e1 · d26c53e1
Commit d26c53e1 authored Apr 01, 2026 by ccran
15 changed files
--- a/.gitignore
+++ b/.gitignore
@@ -7,5 +7,7 @@
 # Keep Python source files
 !**/*.py
+!README.md
 # Keep this file tracked
 !.gitignore
\ No newline at end of file
--- a/README.md
+++ b/README.md
+# 合同审查智能体 (Contract Review Agent)
+一个基于 FastAPI 和大型语言模型 (LLM) 的智能合同审查系统，能够自动分析合同条款、识别风险并提供审查建议。
+## 📋 项目概述
+本项目是一个合同审查智能体 (Agent)，通过以下流程实现合同自动化审查：
+1. **文档解析** - 支持多种格式的合同文档解析
+2. **分段处理** - 将合同按规则智能分段
+3. **事实提取** - 从每个分段中提取与审查规则相关的客观事实
+4. **规则审查** - 基于预设规则对提取的事实进行审查
+5. **风险复核** - 对审查结果进行反思和复核
+6. **结果合并** - 合并所有分段审查结果生成最终报告
+## 🏗️ 项目结构
+```
+lufa-contract/
+├── main.py              # FastAPI 主应用入口
+├── test.py              # 测试脚本
+├── core/                # 核心业务逻辑
+│   ├── cache.py         # 缓存管理
+│   ├── config.py        # 配置管理
+│   ├── memory.py        # 记忆/状态管理
+│   ├── tool.py          # 工具基类
+│   └── tools/           # 具体工具实现
+│       ├── segment_summary.py      # 分段事实提取
+│       ├── segment_review.py       # 分段规则审查
+│       ├── segment_rule_router.py  # 规则路由
+│       ├── retrieve_reference.py   # 参考检索
+│       ├── reflect_retry.py        # 反思重试
+│       └── segment_merger.py       # 结果合并
+├── data/                # 数据文件
+│   ├── rules.xlsx       # 审查规则表
+│   ├── batch/           # 批量处理数据
+│   └── benchmark/       # 基准测试数据
+├── utils/               # 工具函数
+│   ├── common_util.py   # 通用工具
+│   ├── http_util.py     # HTTP 工具
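+│   ├── doc_util.py      # 文档工具 (DocBase 抽象基类)
+│   ├── excel_util.py    # Excel 工具 (加载规则表)
+│   └── spire_word_util.py  # Word 文档处理与批注 (SpireWordDoc)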
+├── demo/                # 演示文件
+├── tmp/                 # 临时文件
+└── .vscode/             # VSCode 配置
+```
+## 🔧 技术栈
+- **后端框架**: FastAPI
+- **LLM 服务**: Qwen2-72B-Instruct (可配置)
+- **文档处理**: 支持 PDF、Word 等多种格式
+- **日志**: Loguru
+- **数据验证**: Pydantic
+## 📦 核心功能
+### 1. 分段事实提取 (SegmentSummary)
+基于审查规则从合同分段中提取客观事实，确保：
+- 事实可在原文中直接找到
+- 不做抽象、概括或推断
+- 不补充未出现的主体、条件或数值
+### 2. 分段规则审查 (SegmentReview)
+对提取的事实进行规则匹配和风险分析，输出：
+- 风险等级 (H/M/L)
+- 审查结论
+- 修改建议
+### 3. 反思重试 (ReflectRetry)
+对审查结果进行自我反思，识别潜在问题并重试
+### 4. 结果合并 (SegmentMerger)
+合并所有分段的审查结果，生成完整的审查报告
+## ⚙️ 配置说明
+在 `core/config.py` 中可配置：
+```text
+# LLM 配置
+LLMConfig:
+  base_url: "http://192.168.252.71:9002/v1"
+  model: "Qwen2-72B-Instruct"
+# 审查规则集
+ALL_RULESET_IDS = ["通用", "借款", "担保", "财务口", "金盘", "金盘简化"]
+# 分段大小控制
+MAX_SINGLE_CHUNK_SIZE = 5000
+```
+## 🚀 快速开始
+### 1. 安装依赖
+```bash
+pip install fastapi uvicorn pydantic loguru
+```
+### 2. 启动服务
+```bash
+python main.py
+```
+服务将在 `http://localhost:8000` 启动
+### 3. API 端点
+- `POST /sleep` - 测试端点
+- `POST /document/parse` - 解析合同文档
+- `POST /contract/review` - 执行合同审查
+- `GET /contract/{conversation_id}/result` - 获取审查结果
+## 📝 使用示例
+### 提交合同审查请求
+```python
+import requests
+# 解析合同文档（通过 file_url 提交待解析的合同文件）
+response = requests.post(
+    "http://localhost:8000/document/parse",
+    json={
+        "conversation_id": "unique-conversation-id",
+        "file_url": "http://example.com/contract.pdf",
+        "ruleset_id": "通用"
+    }
+)
+# 获取审查结果（注意：获取结果前需先调用 POST /contract/review 执行审查，请求参数以 main.py 中的接口定义为准）
+result = requests.get(
+    f"http://localhost:8000/contract/{response.json()['conversation_id']}/result"
+)
+```
+## 🔐 安全说明
+- API Key 配置在 `core/config.py` 中
+- 支持内外网环境切换 (`use_lufa` 参数)
+- 临时文件自动清理
+## 📊 数据格式
+### 审查结果结构
+```json
+{
+  "conversation_id": "xxx",
+  "findings": [
+    {
+      "segment_id": "seg_001",
+      "rule_id": "rule_001",
+      "risk_level": "H",
+      "fact": "提取的事实",
+      "conclusion": "审查结论",
+      "suggestion": "修改建议"
+    }
+  ]
+}
+```
+## 🛠️ 开发指南
+### 添加新的审查规则
+1. 在 `data/rules.xlsx` 中添加新规则
+2. 更新 `core/config.py` 中的规则集配置
+3. 重启服务
+### 自定义 LLM 模型
+修改 `core/config.py` 中的 `LLMConfig`:
+```text
+LLMConfig:
+  base_url: "你的 LLM 服务地址"
+  model: "你的模型名称"
+```
+## 📄 许可证
+内部使用，保留所有权利。
+## 👥 维护者
+- 开发团队
+## 📞 联系方式
+如有问题，请联系项目维护团队。
--- a/__pycache__/main.cpython-312.pyc
+++ b/__pycache__/main.cpython-312.pyc
--- a/core/__pycache__/config.cpython-312.pyc
+++ b/core/__pycache__/config.cpython-312.pyc
--- a/core/__pycache__/memory.cpython-312.pyc
+++ b/core/__pycache__/memory.cpython-312.pyc
--- a/core/memory.py
+++ b/core/memory.py
--- a/core/tools/retrieve_reference.py
+++ b/core/tools/retrieve_reference.py
@@ -9,7 +9,6 @@ from core.tool import ToolBase, tool, tool_func
 from utils.excel_util import ExcelUtil
 @tool("retrieve_reference", "审查参考检索")
 class RetrieveReferenceTool(ToolBase):
    def __init__(self) -> None:
@@ -22,12 +21,16 @@ class RetrieveReferenceTool(ToolBase):
            "triggers": "触发词",
            "suggestion_template": "建议模板",
            "case": "案例",
-            "summary":"摘要项"
+            "summary": "摘要项",
        }
-        rules_path = Path(__file__).resolve().parent.parent.parent / "data" / "rules.xlsx"
+        rules_path = (
+            Path(__file__).resolve().parent.parent.parent / "data" / "rules.xlsx"
+        )
        self.rulesets: Dict[str, List[Dict[str, Any]]] = {}
        for rs_id in ALL_RULESET_IDS:
-            rules = ExcelUtil.load_mapped_excel(rules_path, sheet_name=rs_id, column_map=self.column_map)
+            rules = ExcelUtil.load_mapped_excel(
+                rules_path, sheet_name=rs_id, column_map=self.column_map
+            )
            self.rulesets[rs_id] = rules
    @tool_func(
@@ -40,13 +43,21 @@ class RetrieveReferenceTool(ToolBase):
            "required": [],
        }
    )
-    def run(self, ruleset_id: str = "", routed_rule_titles: List[str] | None = None) -> Dict[str, Any]:
+    def run(
+        self, ruleset_id: str = "", routed_rule_titles: List[str] | None = None
+    ) -> Dict[str, Any]:
        target_ruleset_id = ruleset_id or self.default_ruleset_id
-        full_rules = self.rulesets.get(target_ruleset_id) or self.rulesets.get(self.default_ruleset_id, []) or []
+        full_rules = (
+            self.rulesets.get(target_ruleset_id)
+            or self.rulesets.get(self.default_ruleset_id, [])
+            or []
+        )
        if routed_rule_titles is None:
            rules = full_rules
        else:
-            title_set = {title for title in routed_rule_titles if isinstance(title, str)}
+            title_set = {
+                title for title in routed_rule_titles if isinstance(title, str)
+            }
            rules = [r for r in full_rules if r.get("title") in title_set]
        return {
@@ -59,6 +70,7 @@ class RetrieveReferenceTool(ToolBase):
    def summary_keywords(self, rules: List[Dict[str, Any]]) -> List[str]:
        return [r.get("summary", "") for r in rules if r.get("summary")]
 if __name__ == "__main__":
    tool = RetrieveReferenceTool()
    result = tool.run(ruleset_id="金盘", routed_rule_titles=None)
@@ -66,4 +78,4 @@ if __name__ == "__main__":
        print(f"Rule Title: {rule.get('title')}")
        print(f"Case: {rule.get('case')}")
        print("-" * 20)
    # print(result.get("total", 0))
\ No newline at end of file
--- a/data/batch/batch.py
+++ b/data/batch/batch.py
@@ -3,7 +3,7 @@ import os
 import re
 import sys
-sys.path.append('../..')
+sys.path.append("../..")
 import traceback
 import concurrent.futures
@@ -12,21 +12,21 @@ from loguru import logger
 from utils.common_util import random_str
 from utils.http_util import upload_file, fastgpt_openai_chat, download_file
-# SUFFIX='_麓发迁移'
+SUFFIX = "_麓发迁移"
-# batch_input_dir_path = 'jp-input'
+batch_input_dir_path = "jp-input"
-# batch_output_dir_path = 'jp-output-lufa-new'
+batch_output_dir_path = "jp-output-lufa-new"
-SUFFIX='_麓发'
+# SUFFIX = "_麓发"
-batch_input_dir_path = 'lufa-input'
+# batch_input_dir_path = "lufa-input"
-batch_output_dir_path = 'lufa-output'
+# batch_output_dir_path = "lufa-output"
 batch_size = 5
 # 麓发fastgpt接口
-url = 'http://192.168.252.71:18089/api/v1/chat/completions'
+# url = "http://192.168.252.71:18089/api/v1/chat/completions"
 # 金盘fastgpt接口
-# url = 'http://192.168.252.71:18088/api/v1/chat/completions'
+url = "http://192.168.252.71:18088/api/v1/chat/completions"
 # 麓发合同审查生产token
-token = 'fastgpt-ek3Z6PxI6sXgYc0jxzZ5bVGqrxwM6aVyfSmA6JVErJYBMr2KmYxrHwEUOIMSYz'
+# token = "fastgpt-ek3Z6PxI6sXgYc0jxzZ5bVGqrxwM6aVyfSmA6JVErJYBMr2KmYxrHwEUOIMSYz"
 # 金盘迁移麓发合同审查测试token
-# token = 'fastgpt-vykT6qs07g7hR4tL2MNJE6DdNCIxaQjEu3Cxw9nuTBFg8MAG3CkByvnXKxSNEyMK7'
+token = "fastgpt-vykT6qs07g7hR4tL2MNJE6DdNCIxaQjEu3Cxw9nuTBFg8MAG3CkByvnXKxSNEyMK7"
 # 人机交互测试（测试环境）
 # token = 'fastgpt-p189K5zoTX5wjp0dBybFCwsbWm3juIwlJxt2wTGyiaOWOANI5Y10pKEZzyt'
 # 人机交互测试（生产环境）
@@ -34,9 +34,13 @@ token = 'fastgpt-ek3Z6PxI6sXgYc0jxzZ5bVGqrxwM6aVyfSmA6JVErJYBMr2KmYxrHwEUOIMSYz'
 # 提取后审查测试
 # token = 'fastgpt-n74gGX5ZqLT6o1ysMBSGUTjIciswYOWDRfQ75krMkE5gDVDkpzsbz8u'
 def extract_url(text):
-    # \s * ([ ^ "\s]+?\.(?:docx?|pdf|xlsx)) 
+    # \s * ([ ^ "\s]+?\.(?:docx?|pdf|xlsx))
-    excel_p, doc_p = r'最终审查Excel\s*([^"]*xlsx)', r'最终审查批注\s*([^\" ]+?\.(?:docx?|pdf|wps))'
+    excel_p, doc_p = (
+        r'最终审查Excel\s*([^"]*xlsx)',
+        r"最终审查批注\s*([^\" ]+?\.(?:docx?|pdf|wps))",
+    )
    # 使用 re.search() 查找第一个匹配项
    excel_m, doc_m = re.search(excel_p, text), re.search(doc_p, text)
    if excel_m and doc_m:
@@ -46,7 +50,9 @@ def extract_url(text):
        return None, None
-def process_single_file(file, batch_input_dir_path, batch_output_dir_path, counter, start_file):
+def process_single_file(
+    file, batch_input_dir_path, batch_output_dir_path, counter, start_file
+):
    """
    单文件处理逻辑，可被线程池并发调用
    """
@@ -55,29 +61,45 @@ def process_single_file(file, batch_input_dir_path, batch_output_dir_path, count
        return
    # 提取文件前缀
-    file_name = file[:file.rfind('.')]
+    file_name = file[: file.rfind(".")]
-    ext_name = file[file.rfind('.'):]
+    ext_name = file[file.rfind(".") :]
    # 源目标处理
-    original_file = f'{batch_input_dir_path}/{file}'
+    original_file = f"{batch_input_dir_path}/{file}"
-    des_check_file = f'{batch_output_dir_path}/{file_name}.md'
+    des_check_file = f"{batch_output_dir_path}/{file_name}.md"
-    des_excel_file = f'{batch_output_dir_path}/{file_name}{SUFFIX}.xlsx'
+    des_excel_file = f"{batch_output_dir_path}/{file_name}{SUFFIX}.xlsx"
-    des_doc_file = f'{batch_output_dir_path}/{file_name}{SUFFIX}{ext_name}'
+    des_doc_file = f"{batch_output_dir_path}/{file_name}{SUFFIX}{ext_name}"
    try:
        # 处理原文件
-        file_url = upload_file(original_file, input_url_to_inner=True).replace('218.77.58.8', '192.168.252.71')
+        file_url = upload_file(original_file, input_url_to_inner=True).replace(
-        model = 'Qwen2-72B-Instruct'
+            "218.77.58.8", "192.168.252.71"
+        )
+        model = "Qwen2-72B-Instruct"
        # 合同审核Excel工作流处理
-        logger.info(' 第{}个文件,处理文件: {}'.format(counter, original_file))
+        logger.info(" 第{}个文件,处理文件: {}".format(counter, original_file))
-        result = fastgpt_openai_chat(url, token, model, random_str(), file_url, f'测试批处理任务-{file_name}', False)
+        result = fastgpt_openai_chat(
+            url,
+            token,
+            model,
+            random_str(),
+            file_url,
+            f"测试批处理任务-{file_name}",
+            False,
+        )
        excel_url, doc_url = extract_url(result)
        if excel_url and doc_url:
-            download_file(excel_url.replace('218.77.58.8', '192.168.252.71'), des_excel_file)
+            download_file(
-            download_file(doc_url.replace('218.77.58.8', '192.168.252.71'), des_doc_file)
+                excel_url.replace("218.77.58.8", "192.168.252.71"), des_excel_file
-            logger.info(f'第{counter}个文件下载:{excel_url}到{des_excel_file} {des_doc_file}')
+            )
+            download_file(
+                doc_url.replace("218.77.58.8", "192.168.252.71"), des_doc_file
+            )
+            logger.info(
+                f"第{counter}个文件下载:{excel_url}到{des_excel_file} {des_doc_file}"
+            )
    except Exception as e:
-        logger.error(f'{original_file} 处理异常 第{counter}个文件: {e}')
+        logger.error(f"{original_file} 处理异常 第{counter}个文件: {e}")
        logger.error(traceback.print_exc())
@@ -103,5 +125,5 @@ def execute_batch(max_workers: int = 4):
            f.result()
-if __name__ == '__main__':
+if __name__ == "__main__":
    execute_batch(batch_size)
\ No newline at end of file
--- a/data/benchmark/compare_annotation.py
+++ b/data/benchmark/compare_annotation.py
--- a/data/benchmark/eval.py
+++ b/data/benchmark/eval.py
@@ -121,7 +121,7 @@ def _parse_args() -> argparse.Namespace:
    parser.add_argument(
        "--datasets-dir",
        type=Path,
-        default=base / "results" / "jp-output-renji",
+        default=base / "results" / "jp-output-lufa",
        help="Directory containing Word files with annotations.",
    )
    parser.add_argument(
@@ -133,7 +133,7 @@ def _parse_args() -> argparse.Namespace:
    parser.add_argument(
        "--val-dir",
        type=Path,
-        default=base / "results" / "jp-output-renji-extracted",
+        default=base / "results" / "jp-output-lufa-extracted",
        help="Directory to store extracted xlsx files for comparison.",
    )
    parser.add_argument(

--- a/data/rules.xlsx
+++ b/data/rules.xlsx
--- a/utils/__pycache__/doc_util.cpython-312.pyc
+++ b/utils/__pycache__/doc_util.cpython-312.pyc
--- a/utils/__pycache__/spire_word_util.cpython-312.pyc
+++ b/utils/__pycache__/spire_word_util.cpython-312.pyc
--- a/utils/doc_util.py
+++ b/utils/doc_util.py
@@ -9,7 +9,9 @@ class DocBase(ABC):
        self._doc_path = None
        self._doc_name = None
        self._kwargs = kwargs
-        self._max_single_chunk_size = kwargs.get('max_single_chunk_size', MAX_SINGLE_CHUNK_SIZE)
+        self._max_single_chunk_size = kwargs.get(
+            "max_single_chunk_size", MAX_SINGLE_CHUNK_SIZE
+        )
    @abstractmethod
    def load(self, doc_path):

--- a/utils/spire_word_util.py
+++ b/utils/spire_word_util.py
@@ -509,10 +509,10 @@ class SpireWordDoc(DocBase):
                cell_list.append(cell_content)
            # table_data += "|" + "|".join(cell_list) + "|"
            # table_data += "\n"
-            table_data += ' '.join(cell_list) + '\n'
+            table_data += " ".join(cell_list) + "\n"
            if i == 0:
                # table_data += "|" + "|".join(["--- " for _ in cell_list]) + "|\n"
-                table_data= ' '.join(cell_list) + '\n'
+                table_data = " ".join(cell_list) + "\n"
        return table_data
    def get_chunk_info(self, chunk_id):
@@ -608,14 +608,18 @@ class SpireWordDoc(DocBase):
        return True
    def _update_comment_content(self, comment_idx, suggest):
-        self._doc.Comments.get_Item(comment_idx).Body.Paragraphs.get_Item(0).Text = suggest
+        self._doc.Comments.get_Item(comment_idx).Body.Paragraphs.get_Item(
+            0
+        ).Text = suggest
    def _try_add_comment_in_paragraphs(self, paragraphs, target_text, author, suggest):
        if not target_text:
            return False
        for paragraph in paragraphs:
            text_sel = paragraph.Find(target_text, False, True)
-            if text_sel and self.set_comment_by_text_selection(text_sel, author, suggest):
+            if text_sel and self.set_comment_by_text_selection(
+                text_sel, author, suggest
+            ):
                return True
        return False
@@ -767,8 +771,11 @@ class SpireWordDoc(DocBase):
            # update chunk_id
            comment_chunk_id = comment.get("chunk_id", -1)
            # 优先使用comments里提供的chunk_id，如果没有或无效则使用外部传入的chunk_id，如果都没有则异常处理
-            sub_chunks = self.get_sub_chunks(comment_chunk_id) if comment_chunk_id != -1 \
+            sub_chunks = (
-                and comment_chunk_id < self.get_chunk_num() else self.get_sub_chunks(chunk_id)
+                self.get_sub_chunks(comment_chunk_id)
+                if comment_chunk_id != -1 and comment_chunk_id < self.get_chunk_num()
+                else self.get_sub_chunks(chunk_id)
+            )
            author = self.format_comment_author(comment)
            suggest = comment.get("suggest", "")
            find_key = comment["original_text"].strip() or comment["key_points"]
@@ -808,7 +815,9 @@ class SpireWordDoc(DocBase):
        normalized_author = self._normalize_author_prefix(author)
        for i in range(self._doc.Comments.Count):
            current_comment = self._doc.Comments.get_Item(i)
-            comment_author = self._normalize_author_prefix(current_comment.Format.Author)
+            comment_author = self._normalize_author_prefix(
+                current_comment.Format.Author
+            )
            if comment_author == normalized_author:
                return i
        return None
@@ -876,9 +885,7 @@ class SpireWordDoc(DocBase):
 if __name__ == "__main__":
    doc = SpireWordDoc()
-    doc.load(
+    doc.load(r"/home/ccran/lufa-contract/demo/今麦郎合同审核.docx")
-        r"/home/ccran/lufa-contract/demo/今麦郎合同审核.docx"
-    )
    print(doc._doc_name)
    print("附件2《技术协议》" in doc.get_all_text())
    # doc.add_chunk_comment(
@@ -895,4 +902,4 @@ if __name__ == "__main__":
    #         }
    #     ],
    # )
    # doc.to_file("/home/ccran/lufa-contract/demo/今麦郎合同审核_test.docx", True)
\ No newline at end of file