Commit 49b473ef by ccran

feat: 新增摘要项;

parent e48b2cd5
......@@ -24,6 +24,7 @@ MERGE_RULE_PROMPT = False
MAX_SINGLE_CHUNK_SIZE = 5000
META_KEY = "META"
DEFAULT_RULESET_ID = "通用"
## Ruleset ID list: every entry must match a sheet name in rules.xlsx exactly!
ALL_RULESET_IDS = [
"通用",
"借款",
......@@ -33,6 +34,7 @@ ALL_RULESET_IDS = [
"金盘简化",
"麓发测试",
"麓发标准",
"金盘B类"
]
MAX_WORKERS = 10
FILE_SUFFIX = "-审核批注"
......@@ -43,6 +45,7 @@ use_lufa = False
use_jp_machine = True
## 关键参数**
ocr_url = 'http://192.168.252.71:8202/openapi/ocrUploadFile'
if use_lufa:
outer_backend_url = "http://znkf.lgfzgroup.com:48081"
base_fastgpt_url = "http://192.168.252.71:18089"
......@@ -62,6 +65,7 @@ else:
outer_backend_url = "http://172.21.107.45:48080"
base_fastgpt_url = "http://172.21.107.45:3030"
base_backend_url = "http://172.21.107.45:48080"
ocr_url = "http://172.21.107.45:8202/openapi/ocrUploadFile"
segment_review_api_key = (
"fastgpt-vLu2JHAfqwEq5FUQhvATFDK0yDS6fs804v7KwWBMyU4sRrHzh4UGl89Zpa"
)
......
......@@ -347,13 +347,13 @@ class MemoryStore:
ws_facts.append(["元信息", "事实内容"])
for item in self.facts:
if not isinstance(item, dict):
ws_facts.append(["事实", json.dumps(item, ensure_ascii=False)])
ws_facts.append(["事实", self._format_summary_for_export(item)])
continue
meta_info = item.get(META_KEY, None)
ws_facts.append(
[
json.dumps(meta_info, ensure_ascii=False),
json.dumps(item, ensure_ascii=False),
self._format_summary_for_export(item),
]
)
else:
......@@ -443,6 +443,43 @@ class MemoryStore:
return safe[:31]
@staticmethod
def _format_summary_for_export(value: Any, level: int = 0) -> str:
    """Render a nested summary value as indented plain text.

    A dict yields one ``key:value`` line per entry; entries stored under
    ``META_KEY`` are left out. When an entry holds a dict or list, a bare
    ``key:`` header line is emitted and the children follow one level
    deeper. A list yields one line per element at the *current* level.
    Scalars go through ``MemoryStore._format_scalar``.

    Args:
        value: Dict, list or scalar to render.
        level: Current nesting depth; the indent is ``" " * level``.

    Returns:
        The rendered lines joined with ``"\\n"`` (empty string for an
        empty dict or list).
    """
    pad = " " * level

    # Leaf value: a single indented line.
    if not isinstance(value, (dict, list)):
        return pad + MemoryStore._format_scalar(value)

    rendered: List[str] = []

    if isinstance(value, dict):
        for name, node in value.items():
            if name == META_KEY:
                # Metadata is not part of the summary text.
                continue
            label = pad + str(name) + ":"
            if not isinstance(node, (dict, list)):
                rendered.append(label + MemoryStore._format_scalar(node))
                continue
            # Container child: header first, then its body one level deeper.
            rendered.append(label)
            nested = MemoryStore._format_summary_for_export(node, level + 1)
            if nested:
                rendered.append(nested)
        return "\n".join(rendered)

    # Remaining case: a list. Elements stay at the current level.
    for element in value:
        if isinstance(element, (dict, list)):
            nested = MemoryStore._format_summary_for_export(element, level)
            if nested:
                rendered.append(nested)
        else:
            rendered.append(pad + MemoryStore._format_scalar(element))
    return "\n".join(rendered)
@staticmethod
def _format_scalar(value: Any) -> str:
    """Return the export text of a leaf value.

    ``None`` becomes the empty string; anything else is passed to ``str``.
    """
    return "" if value is None else str(value)
@staticmethod
def _normalize_finding_key(key: str) -> str:
normalized = (key or "").strip().lower()
if not normalized:
......
......@@ -19,9 +19,9 @@ class LLMTool(ToolBase):
self.system_prompt = system_prompt
self.llm = OpenAITool(LLM[llm_key], max_workers=MAX_WORKERS)
def build_messages(
    self, user_content: str, system_content: str = None
) -> List[Dict[str, str]]:
    """Build the system + user message pair for a single chat call.

    Args:
        user_content: Text of the user message.
        system_content: Optional per-call system prompt. When it is omitted
            (or empty), the instance's ``system_prompt`` is used, so
            existing one-argument callers keep their behavior.

    Returns:
        A two-element list: the system message followed by the user
        message, each as a ``{"role": ..., "content": ...}`` dict.
    """
    return [
        # Exactly one system entry: the override if given, else the default.
        {"role": "system", "content": system_content or self.system_prompt},
        {"role": "user", "content": user_content},
    ]
......
......@@ -2,7 +2,7 @@ from __future__ import annotations
import json
import re
from typing import Dict, List, Optional
from typing import Dict, List, Literal, Optional
from core.tool import tool, tool_func
from core.tools.segment_llm import LLMTool
......@@ -47,6 +47,46 @@ ROUTER_USER_PROMPT = """
"""
# System prompt for the summary-item routing LLM call. It tells the model to
# pick, from a list of candidate summary-item names, the ones the current
# contract segment should be extracted for, and to answer with strict JSON
# under the "selected_items" key. It is sent as-is (never passed through
# str.format), so the literal braces in the JSON example are safe.
SUMMARY_ROUTER_SYSTEM_PROMPT = """
你是合同分段摘要项路由智能体(SegmentSummaryRouter)。
你的任务是:基于“当前分段文本”,从候选摘要项名称中选出当前分段应提取的摘要项。
【路由目标】
- 仅做摘要项适配判断,不输出事实摘要、不输出风险结论、不输出审查建议。
- 候选摘要项只有名称,没有规则正文、触发词或其他辅助信息。
- 高召回优先:只要当前分段明显包含某个摘要项所需的信息,就应路由命中。
- 若候选摘要项明显无关,则不要命中。
【判断依据】
- 以当前分段文本为主。
- 可参考上下文记忆辅助理解术语,但不得脱离当前分段文本做臆断。
【输出约束】
- 严格输出 JSON。
- 摘要项路由只输出命中的摘要项名称,不输出其他信息。
- 若确实没有任何相关摘要项,返回 {"selected_items": []}。
"""
# User-message template for summary-item routing. Filled with str.format, so
# the only braces it may contain are its four placeholders: segment_text,
# context_memories_json, party_role and candidate_summaries_json.
SUMMARY_ROUTER_USER_PROMPT = """
【当前分段文本】
{segment_text}
【上下文记忆】
{context_memories_json}
【合同立场】
{party_role}
【候选摘要项名称】
{candidate_summaries_json}
【任务】
请从候选摘要项名称中选择当前分段应提取的摘要项,并输出 selected_items。
"""
ROUTER_OUTPUT_SCHEMA = """
```json
{
......@@ -61,6 +101,19 @@ ROUTER_OUTPUT_SCHEMA = """
"""
# Example of the JSON reply expected from the summary-item router: a
# "selected_items" list of {"name": ...} objects. It is concatenated onto the
# already-formatted user prompt rather than embedded in the template, which
# keeps its literal braces away from str.format.
SUMMARY_ROUTER_OUTPUT_SCHEMA = """
```json
{
"selected_items": [
{
"name": "摘要项名称"
}
]
}
```
"""
@tool("segment_rule_router", "分段规则路由")
class SegmentRuleRouterTool(LLMTool):
def __init__(self) -> None:
......@@ -75,6 +128,7 @@ class SegmentRuleRouterTool(LLMTool):
"rules": {"type": "array", "items": {"type": "object"}},
"party_role": {"type": "string"},
"context_memories": {"type": "array"},
"route_by": {"type": "string"},
},
"required": ["segment_id", "segment_text", "rules", "party_role"],
}
......@@ -86,8 +140,24 @@ class SegmentRuleRouterTool(LLMTool):
rules: List[Dict],
party_role: str,
context_memories: Optional[List[Dict]] = None,
route_by: Literal["rule", "summary"] = "rule",
) -> Dict:
rules = rules or []
if route_by == "summary":
routed_summary_names = self._route_summaries(
segment_text=segment_text,
rules=rules,
party_role=party_role,
context_memories=context_memories,
)
return {
"segment_id": segment_id,
"route_by": route_by,
"routed_rules": [],
"routed_rule_titles": [],
"routed_summary_names": routed_summary_names,
}
routed_rules = self._route_rules(
segment_text=segment_text,
rules=rules,
......@@ -96,8 +166,10 @@ class SegmentRuleRouterTool(LLMTool):
)
return {
"segment_id": segment_id,
"route_by": route_by,
"routed_rules": routed_rules,
"routed_rule_titles": [r.get("title", "") for r in routed_rules],
"routed_summary_names": [],
}
def _build_candidate_rules(self, rules: List[Dict]) -> List[Dict]:
......@@ -105,6 +177,17 @@ class SegmentRuleRouterTool(LLMTool):
{r.get("title", ""): r.get("rule", "")} for r in rules if r.get("title")
]
def _build_candidate_summaries(self, rules: List[Dict]) -> List[str]:
    """Collect the distinct, non-empty ``summary`` names found in ``rules``.

    Args:
        rules: Rule dicts; each may carry a ``"summary"`` entry.

    Returns:
        Stripped summary names in first-seen order, without duplicates.
        Rules whose summary is missing, ``None`` or blank contribute
        nothing.
    """
    # A dict doubles as an insertion-ordered set.
    ordered: Dict[str, None] = {}
    for rule in rules:
        raw = rule.get("summary")
        if raw is None:
            # An explicit None must not be stringified into the bogus
            # candidate name "None".
            continue
        name = str(raw).strip()
        if name:
            ordered.setdefault(name, None)
    return list(ordered)
def _route_rules(
self,
segment_text: str,
......@@ -183,6 +266,66 @@ class SegmentRuleRouterTool(LLMTool):
)
return routed_rules
def _route_summaries(
    self,
    segment_text: str,
    rules: List[Dict],
    party_role: str,
    context_memories: Optional[List[Dict]],
) -> List[str]:
    """Select the summary items that apply to one contract segment.

    Candidate names come from the ``summary`` field of ``rules``. A name is
    routed when the LLM selects it or when it occurs verbatim in the
    segment text; the two sources are merged.

    Args:
        segment_text: Text of the segment being routed.
        rules: Rule dicts supplying the candidate summary names.
        party_role: Contract standpoint, forwarded to the prompt.
        context_memories: Optional context entries, forwarded to the
            prompt as JSON (``None`` is sent as an empty list).

    Returns:
        The routed names, in candidate order. Empty when there are no
        candidates. If the LLM call or its parsing fails, only the
        verbatim matches are returned.
    """
    # Local import keeps this method self-contained; stdlib only.
    import logging

    candidates = self._build_candidate_summaries(rules)
    if not candidates:
        return []

    user_content = (
        SUMMARY_ROUTER_USER_PROMPT.format(
            segment_text=segment_text,
            context_memories_json=json.dumps(
                context_memories or [], ensure_ascii=False
            ),
            party_role=party_role,
            candidate_summaries_json=json.dumps(candidates, ensure_ascii=False),
        )
        + SUMMARY_ROUTER_OUTPUT_SCHEMA
    )

    try:
        resp = self.run_with_loop(
            self.chat_async(
                self.build_messages(
                    user_content,
                    system_content=SUMMARY_ROUTER_SYSTEM_PROMPT,
                )
            )
        )
        data = self.parse_first_json(resp)
        llm_selected = data.get("selected_items", []) or []
    except Exception:
        # Routing stays best-effort (verbatim matches below still apply),
        # but the failure is recorded instead of vanishing silently.
        logging.getLogger(__name__).warning(
            "summary routing LLM call failed; using direct name matches only",
            exc_info=True,
        )
        llm_selected = []

    # The model may answer with a single object/string instead of a list.
    if isinstance(llm_selected, (dict, str)):
        llm_selected = [llm_selected]
    elif not isinstance(llm_selected, list):
        llm_selected = []

    selected_names = set()
    for item in llm_selected:
        # Skip entries such as null or numbers that carry no name.
        if not isinstance(item, (dict, str)):
            continue
        name = self._selected_item_name(item)
        if name:
            selected_names.add(name)

    text = segment_text or ""
    # Candidate order is preserved; unknown names from the LLM are dropped.
    return [
        name for name in candidates if name in selected_names or name in text
    ]
def _selected_item_name(self, item: Dict | str) -> str:
    """Extract the summary-item name from one LLM-selected entry.

    Args:
        item: Either the name itself (a string) or a dict carrying it
            under ``"name"``, ``"summary"``, ``"summary_name"`` or
            ``"title"`` (checked in that order; first truthy value wins).

    Returns:
        The stripped name, or ``""`` when no name is present or the entry
        is neither a string nor a dict.
    """
    if isinstance(item, str):
        return item.strip()
    if not isinstance(item, dict):
        # Parsed LLM JSON can contain null/number entries; treat them as
        # "no name" rather than raising AttributeError on ``.get``.
        return ""
    for key in ("name", "summary", "summary_name", "title"):
        value = item.get(key)
        if value:
            return str(value).strip()
    return ""
def _match_trigger_titles(self, segment_text: str, rules: List[Dict]) -> set[str]:
text = segment_text or ""
matched_titles: set[str] = set()
......
No preview for this file type
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional
from typing import Any, Dict, List, Literal, Optional
from uuid import uuid4
import ast
......@@ -121,6 +121,7 @@ class SegmentSummaryRequest(BaseModel):
party_role: Optional[str] = ""
ruleset_id: Optional[str] = "通用"
routed_rule_titles: Optional[List[str]] = None
routed_summary_names: Optional[List[str]] = None
file_ext: str
context_facts: Optional[Dict] = None
......@@ -151,10 +152,23 @@ def summarize_facts(payload: SegmentSummaryRequest) -> SegmentSummaryResponse:
)
ruleset_id = payload.ruleset_id or reference_tool.default_ruleset_id
rules = reference_tool.run(
ruleset_id=ruleset_id,
routed_rule_titles=payload.routed_rule_titles,
).get("rules", [])
if payload.routed_summary_names is not None:
summary_names = {
name.strip()
for name in payload.routed_summary_names
if isinstance(name, str) and name.strip()
}
all_rules = reference_tool.run(ruleset_id=ruleset_id).get("rules", [])
rules = [
rule
for rule in all_rules
if str(rule.get("summary", "")).strip() in summary_names
]
else:
rules = reference_tool.run(
ruleset_id=ruleset_id,
routed_rule_titles=payload.routed_rule_titles,
).get("rules", [])
result = summary_tool.run(
segment_id=segment_idx,
segment_text=segment_text,
......@@ -182,6 +196,9 @@ class SegmentReviewRequest(BaseModel):
routed_rule_titles: Optional[List[str]] = None
file_ext: str
context_memories: Optional[List[Dict]] = None
route_by: Literal["rule", "summary"] = Field(
default="rule", description="路由依据:rule=审查规则项,summary=摘要项"
)
class SegmentReviewResponse(BaseModel):
......@@ -195,7 +212,9 @@ class SegmentRuleRouterResponse(BaseModel):
conversation_id: str
segment_id: int
ruleset_id: str
route_by: Literal["rule", "summary"] = "rule"
routed_rule_titles: List[str]
routed_summary_names: List[str] = Field(default_factory=list)
routed_rules: List[Dict]
......@@ -311,13 +330,16 @@ def route_segment_rules(payload: SegmentReviewRequest) -> SegmentRuleRouterRespo
rules=rules,
party_role=payload.party_role or "",
context_memories=payload.context_memories,
route_by=payload.route_by,
)
return SegmentRuleRouterResponse(
conversation_id=payload.conversation_id,
segment_id=payload.segment_id,
ruleset_id=ruleset_id,
route_by=result.get("route_by", payload.route_by),
routed_rule_titles=result.get("routed_rule_titles", []),
routed_summary_names=result.get("routed_summary_names", []),
routed_rules=result.get("routed_rules", []),
)
......
......@@ -10,8 +10,7 @@ from utils.http_util import url_replace_fastgpt, download_file
from utils.common_util import random_str
from loguru import logger
import json
ocr_url = 'http://192.168.252.71:8202/openapi/ocrUploadFile'
from core.config import ocr_url
class OCRUtil:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment