Commit 447b8d67 by ccran

feat: 更新摘要写入;增加提取工作流;

parent d92c12a9
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
!**/*.xlsx !**/*.xlsx
!**/*.pdf !**/*.pdf
!**/*.xls !**/*.xls
!workflow/**
!README.md !README.md
......
...@@ -397,6 +397,7 @@ class MemoryStore: ...@@ -397,6 +397,7 @@ class MemoryStore:
"""Export raw facts and merged facts to Excel, upload, then delete the local file.""" """Export raw facts and merged facts to Excel, upload, then delete the local file."""
try: try:
from openpyxl import Workbook # type: ignore from openpyxl import Workbook # type: ignore
from openpyxl.styles import Alignment # type: ignore
except ImportError as exc: except ImportError as exc:
raise ImportError( raise ImportError(
"openpyxl is required for export_facts_to_excel; install via 'pip install openpyxl'" "openpyxl is required for export_facts_to_excel; install via 'pip install openpyxl'"
...@@ -409,12 +410,16 @@ class MemoryStore: ...@@ -409,12 +410,16 @@ class MemoryStore:
with self._lock: with self._lock:
wb = Workbook() wb = Workbook()
ws_facts = wb.active ws_merge_facts = wb.active
ws_facts.title = self._safe_sheet_name("合同事实")
ws_merge_facts.title = self._safe_sheet_name("合并事实")
self._append_merge_facts_sheet(ws_merge_facts, self.merge_facts)
ws_facts = wb.create_sheet(self._safe_sheet_name("合同事实"))
self._append_facts_sheet(ws_facts, self.facts) self._append_facts_sheet(ws_facts, self.facts)
ws_merge_facts = wb.create_sheet(self._safe_sheet_name("合并事实")) self._format_content_column(ws_merge_facts, Alignment)
self._append_merge_facts_sheet(ws_merge_facts, self.merge_facts) self._format_content_column(ws_facts, Alignment)
wb.save(output_path) wb.save(output_path)
...@@ -533,6 +538,15 @@ class MemoryStore: ...@@ -533,6 +538,15 @@ class MemoryStore:
ws.append([summary_name, self._format_summary_for_export(value)]) ws.append([summary_name, self._format_summary_for_export(value)])
@staticmethod @staticmethod
def _format_content_column(ws: Any, alignment_cls: Any) -> None:
    """Apply fixed column widths and wrapped, top-aligned text to a sheet.

    Column A is set to width 50 and column B to width 200. Every cell that
    ``ws["B"]`` yields receives the same alignment object (wrap enabled,
    vertical anchor "top"), and the explicit height of that cell's row is
    reset to ``None``.

    Args:
        ws: Worksheet-like object exposing ``column_dimensions``,
            ``row_dimensions`` and ``ws["B"]`` column access.
        alignment_cls: Callable accepting ``wrap_text`` and ``vertical``
            keyword arguments; the caller passes the alignment class it
            imported so this helper does not import it itself.
    """
    for column_letter, column_width in (("A", 50), ("B", 200)):
        ws.column_dimensions[column_letter].width = column_width

    wrapped_top = alignment_cls(wrap_text=True, vertical="top")
    row_dims = ws.row_dimensions
    for content_cell in ws["B"]:
        content_cell.alignment = wrapped_top
        # NOTE(review): presumably clearing the height lets the spreadsheet
        # application size the row to the wrapped text - confirm.
        row_dims[content_cell.row].height = None
@staticmethod
def _format_summary_for_export(value: Any, level: int = 0) -> str: def _format_summary_for_export(value: Any, level: int = 0) -> str:
indent = " " * level indent = " " * level
if isinstance(value, dict): if isinstance(value, dict):
......
from __future__ import annotations
import json
from typing import Dict, List
from core.tool import tool, tool_func
from core.tools.segment_llm import LLMTool
from loguru import logger
# Closed set of ruleset IDs the router is allowed to return.
# RulesetRouterTool.run() serialises this list into the user prompt, accepts a
# model answer only if it is one of these values, and returns the first entry
# when the question is empty.
AVAILABLE_RULESET_IDS = [
    "合同信息提取(合同组)",
    "合同信息提取(技术部)",
    "合同信息提取(采购部)",
    "技术协议提取(合同组)",
    "技术协议提取(技术部)",
]
# System prompt handed to the LLMTool base class by RulesetRouterTool.__init__.
# It tells the model to pick exactly one candidate ruleset_id and to answer in
# strict JSON.
# NOTE(review): rule 3 writes the department suffixes with full-width
# parentheses, while AVAILABLE_RULESET_IDS and rule 4 use ASCII parentheses -
# confirm the model is not nudged into emitting IDs that fail the exact-match
# check in run().
RULESET_ROUTER_SYSTEM_PROMPT = """
你是合同审查/信息提取系统的 ruleset_id 路由智能体。
你的任务是:根据用户的问题,从候选 ruleset_id 中选择最合适的一个。
选择原则:
1. 如果用户问题关注合同基础信息、商务条款、合同主体、金额、期限、付款、签署等合同文本信息,优先选择“合同信息提取”类规则集。
2. 如果用户问题关注技术参数、技术方案、设备规格、供货范围、验收标准、技术附件、技术协议等技术内容,优先选择“技术协议提取”类规则集。
3. 如果问题明确提到部门或使用方:
- 合同组、法务、合同管理 -> “(合同组)”
- 技术部、技术人员、技术评审 -> “(技术部)”
- 采购部、采购、供应商准入、采购流程 -> “(采购部)”
4. 如果没有明确部门:
- 合同信息提取默认选择“合同信息提取(合同组)”
- 技术协议提取默认选择“技术协议提取(技术部)”
5. 只能输出候选 ruleset_id 中的一个,不得编造。
严格输出 JSON。
"""
# User-message template filled in by RulesetRouterTool.run() via str.format().
# Placeholders: {question} (the stripped user question) and {ruleset_ids}
# (AVAILABLE_RULESET_IDS serialised as JSON). The doubled braces around the
# example object are str.format() escapes that render as literal "{" and "}".
RULESET_ROUTER_USER_PROMPT = """
用户问题:
{question}
候选 ruleset_id:
{ruleset_ids}
请输出 JSON,格式如下:
```json
{{
"ruleset_id": "候选 ruleset_id 中的一个",
"reason": "简短说明选择原因"
}}
```
"""
@tool("ruleset_router", "根据用户问题选择 ruleset_id")
class RulesetRouterTool(LLMTool):
    """Pick one of ``AVAILABLE_RULESET_IDS`` for a user question.

    The LLM is consulted first. If the call raises, or the answer is not one
    of the known ruleset IDs, a keyword-based fallback decides instead, so
    ``run`` always returns a valid ID.
    """

    def __init__(self) -> None:
        """Initialise the base LLM tool with the router system prompt."""
        super().__init__(RULESET_ROUTER_SYSTEM_PROMPT)

    @tool_func(
        {
            "type": "object",
            "properties": {
                "question": {"type": "string"},
            },
            "required": ["question"],
        }
    )
    def run(self, question: str) -> Dict[str, str]:
        """Route ``question`` to a ruleset.

        Args:
            question: Free-text user question. ``None`` or whitespace-only
                input is treated as empty.

        Returns:
            A dict with ``"ruleset_id"`` (always a member of
            ``AVAILABLE_RULESET_IDS``) and ``"reason"``.
        """
        normalized_question = (question or "").strip()
        if not normalized_question:
            return {
                "ruleset_id": AVAILABLE_RULESET_IDS[0],
                "reason": "用户问题为空,默认选择合同组合同信息提取规则集。",
            }

        user_content = RULESET_ROUTER_USER_PROMPT.format(
            question=normalized_question,
            ruleset_ids=json.dumps(AVAILABLE_RULESET_IDS, ensure_ascii=False),
        )
        try:
            resp = self.run_with_loop(self.chat_async(self.build_messages(user_content)))
            data = self.parse_first_json(resp)
            ruleset_id = str(data.get("ruleset_id", "")).strip()
            reason = str(data.get("reason", "")).strip()
            # The system prompt spells department suffixes with full-width
            # parentheses while the canonical IDs use ASCII ones; accept
            # either spelling so a correct choice is not discarded.
            ruleset_id = ruleset_id.replace("(", "(").replace(")", ")")
            if ruleset_id in AVAILABLE_RULESET_IDS:
                return {"ruleset_id": ruleset_id, "reason": reason}
            logger.warning(
                "Ruleset router LLM returned unknown ruleset_id {!r}, fallback to keyword route",
                ruleset_id,
            )
        except Exception as exc:
            # Deliberately broad: any LLM or parsing failure must degrade to
            # the keyword route instead of failing the request. Loguru formats
            # with str.format-style braces, so "%s" would never be filled in.
            logger.error("Ruleset router LLM failed, fallback to keyword route: {}", exc)
        return self._fallback_route(normalized_question)

    def _fallback_route(self, question: str) -> Dict[str, str]:
        """Choose a ruleset from keywords when the LLM route is unavailable.

        Technical content keywords select the "技术协议提取" family; otherwise
        the "合同信息提取" family is used. Department keywords then select the
        variant within that family.

        Args:
            question: Non-empty, already stripped user question.

        Returns:
            A dict with ``"ruleset_id"`` and ``"reason"``.
        """

        def mentions(text: str, keywords: tuple) -> bool:
            return any(keyword in text for keyword in keywords)

        technical_department_terms = ("技术部", "技术人员", "技术评审")

        # The content keyword "技术" is a substring of every technical
        # department term. Remove those terms before testing for technical
        # content; otherwise a bare department mention is always classified
        # as a technical-agreement question and the "合同信息提取(技术部)"
        # route below can never be reached.
        content_text = question
        for term in technical_department_terms:
            content_text = content_text.replace(term, "")

        if mentions(content_text, ("技术协议", "技术", "参数", "规格", "验收标准", "供货范围")):
            if mentions(question, ("合同组", "法务", "合同管理")):
                return {
                    "ruleset_id": "技术协议提取(合同组)",
                    "reason": "问题涉及技术协议内容,并提到合同组相关场景。",
                }
            return {
                "ruleset_id": "技术协议提取(技术部)",
                "reason": "问题涉及技术协议或技术内容,默认选择技术部规则集。",
            }

        if mentions(question, ("采购部", "采购", "供应商")):
            return {
                "ruleset_id": "合同信息提取(采购部)",
                "reason": "问题涉及采购或供应商相关场景。",
            }

        if mentions(question, technical_department_terms):
            return {
                "ruleset_id": "合同信息提取(技术部)",
                "reason": "问题涉及技术部相关场景。",
            }

        return {
            "ruleset_id": "合同信息提取(合同组)",
            "reason": "问题未明确部门或技术协议场景,默认选择合同组合同信息提取规则集。",
        }
if __name__ == "__main__":
    # Manual smoke test: route one sample question and print the result.
    # The instance is named `router` so it does not rebind the `tool`
    # decorator imported from core.tool at module scope.
    router = RulesetRouterTool()
    print(router.run("帮我提取技术协议里的设备参数"))
No preview for this file type
...@@ -28,6 +28,7 @@ from core.tools.retrieve_reference import RetrieveReferenceTool ...@@ -28,6 +28,7 @@ from core.tools.retrieve_reference import RetrieveReferenceTool
from core.tools.reflect_retry import ReflectRetryTool from core.tools.reflect_retry import ReflectRetryTool
from core.tools.segment_merger import SegmentMergerTool from core.tools.segment_merger import SegmentMergerTool
from core.tools.fact_merger import FactMergerTool from core.tools.fact_merger import FactMergerTool
from core.tools.ruleset_router import RulesetRouterTool
from core.memory import Finding from core.memory import Finding
from core.memory import FINDING_KEY_MERGE, FINDING_KEY_REFLECT, FINDING_KEY_REVIEW from core.memory import FINDING_KEY_MERGE, FINDING_KEY_REFLECT, FINDING_KEY_REVIEW
...@@ -42,6 +43,7 @@ reference_tool = RetrieveReferenceTool() ...@@ -42,6 +43,7 @@ reference_tool = RetrieveReferenceTool()
reflect_tool = ReflectRetryTool() reflect_tool = ReflectRetryTool()
merger_tool = SegmentMergerTool() merger_tool = SegmentMergerTool()
fact_merger_tool = FactMergerTool() fact_merger_tool = FactMergerTool()
ruleset_router_tool = RulesetRouterTool()
@app.post("/sleep") @app.post("/sleep")
...@@ -72,6 +74,30 @@ class DocumentParseResponse(BaseModel): ...@@ -72,6 +74,30 @@ class DocumentParseResponse(BaseModel):
file_name: Optional[str] = None file_name: Optional[str] = None
# Request body for POST /rulesets/route.
class RulesetRouteRequest(BaseModel):
    # Required free-text question; the route handler strips it and answers
    # HTTP 400 when nothing is left.
    question: str = Field(..., description="User question used to select ruleset_id")
# Response body for POST /rulesets/route.
class RulesetRouteResponse(BaseModel):
    # The stripped question that was routed.
    question: str
    # Ruleset ID taken from the router tool's result.
    ruleset_id: str
    # Explanation taken from the router tool's result; empty string by default.
    reason: str = ""
@app.post("/rulesets/route", response_model=RulesetRouteResponse)
def route_ruleset(payload: RulesetRouteRequest) -> RulesetRouteResponse:
    # Select a ruleset_id for the submitted question.
    # A question that is empty after stripping is rejected with HTTP 400;
    # otherwise the shared router tool decides and its result is echoed back
    # together with the stripped question.
    question = payload.question.strip() if payload.question else ""
    if question == "":
        raise HTTPException(status_code=400, detail="question cannot be empty")

    routed = ruleset_router_tool.run(question)
    chosen_ruleset = routed.get("ruleset_id", "")
    explanation = routed.get("reason", "")
    return RulesetRouteResponse(
        question=question,
        ruleset_id=chosen_ruleset,
        reason=explanation,
    )
@app.post("/documents/parse", response_model=DocumentParseResponse) @app.post("/documents/parse", response_model=DocumentParseResponse)
async def parse_document(payload: DocumentParseRequest) -> DocumentParseResponse: async def parse_document(payload: DocumentParseRequest) -> DocumentParseResponse:
if not payload.urls: if not payload.urls:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment