Commit d92c12a9 by ccran

feat: 新增提取信息导出;

parent 49b473ef
......@@ -34,7 +34,12 @@ ALL_RULESET_IDS = [
"金盘简化",
"麓发测试",
"麓发标准",
"金盘B类"
"金盘B类",
"合同信息提取(合同组)",
"合同信息提取(技术部)",
"合同信息提取(采购部)",
"技术协议提取(合同组)",
"技术协议提取(技术部)",
]
MAX_WORKERS = 10
FILE_SUFFIX = "-审核批注"
......
......@@ -80,6 +80,7 @@ class MemoryStore:
self._storage_path.parent.mkdir(parents=True, exist_ok=True)
self._lock = RLock()
self.facts: List[Dict[str, Any]] = []
self.merge_facts: List[Dict[str, Any]] = []
self.findings: Dict[str, List[Finding]] = {}
self._load()
......@@ -100,6 +101,24 @@ class MemoryStore:
with self._lock:
return self.facts # deep copy
def set_merge_facts(
    self, merge_facts: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
    """Replace the stored merged facts and persist the store.

    A falsy argument (``None`` or an empty list) is stored as a new empty
    list. Returns the list now held by the store (the live object, not a
    copy).
    """
    replacement = merge_facts or []
    with self._lock:
        self.merge_facts = replacement
        self._persist()
        return self.merge_facts
def add_merge_facts(self, partial: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Append one merge result to the stored merged facts and persist.

    Returns the store's merged-facts list after the append (the live
    object, not a copy).
    """
    with self._lock:
        bucket = self.merge_facts
        bucket.append(partial)
        self._persist()
        return self.merge_facts
def get_merge_facts(self) -> List[Dict[str, Any]]:
    """Return the stored merged facts.

    The live list is returned, not a copy, so caller mutations affect the
    store.
    """
    with self._lock:
        current = self.merge_facts
    return current
def search_facts(self, keywords: List[str]) -> List[Any]:
keys = [str(k).strip().lower() for k in (keywords or []) if str(k).strip()]
if not keys:
......@@ -233,12 +252,14 @@ class MemoryStore:
def clear(self) -> None:
    """Empty facts, merged facts and findings in place, then persist."""
    with self._lock:
        # Clear in place so existing references to these containers see the reset.
        for container in (self.facts, self.merge_facts, self.findings):
            container.clear()
        self._persist()
def _persist(self) -> None:
payload = {
"facts": self.facts,
"merge_facts": self.merge_facts,
"findings": {
key: [asdict(f) for f in values]
for key, values in self.findings.items()
......@@ -259,6 +280,7 @@ class MemoryStore:
data = json.loads(raw or "{}")
if isinstance(data, dict):
self.facts = data.get("facts") or []
self.merge_facts = data.get("merge_facts") or []
loaded_findings = data.get("findings", {})
findings_map: Dict[str, List[Finding]] = {}
......@@ -371,6 +393,41 @@ class MemoryStore:
return res
def export_facts_to_excel(self, file_name: Optional[str] = None) -> Dict[str, Any]:
    """Export raw facts and merged facts to Excel, upload, then delete the local file.

    Args:
        file_name: Optional base name for the workbook. Only its stem is
            used; ``FILE_SUFFIX`` plus a timestamp and ``.xlsx`` are
            appended. Defaults to ``facts_export.xlsx``.

    Returns:
        Whatever ``upload_file`` returns for the saved workbook.

    Raises:
        ImportError: If openpyxl is not installed.
    """
    try:
        from openpyxl import Workbook  # type: ignore
    except ImportError as exc:
        raise ImportError(
            "openpyxl is required for export_facts_to_excel; install via 'pip install openpyxl'"
        ) from exc

    file_suffix = FILE_SUFFIX + datetime.now().strftime("%Y%m%d_%H%M%S")
    name = file_name or "facts_export.xlsx"
    name = Path(name).stem + file_suffix + ".xlsx"
    output_path = Path(__file__).resolve().parent.parent / "tmp" / name
    # The tmp directory may not exist yet (e.g. fresh checkout); saving would fail without it.
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Build the workbook under the lock so both sheets reflect one consistent snapshot.
    with self._lock:
        wb = Workbook()
        ws_facts = wb.active
        ws_facts.title = self._safe_sheet_name("合同事实")
        self._append_facts_sheet(ws_facts, self.facts)
        ws_merge_facts = wb.create_sheet(self._safe_sheet_name("合并事实"))
        self._append_merge_facts_sheet(ws_merge_facts, self.merge_facts)

    try:
        # Saving sits inside the try so a partially written file is also cleaned up.
        wb.save(output_path)
        res = upload_file(str(output_path))
    finally:
        try:
            output_path.unlink()
        except FileNotFoundError:
            # Save failed before the file was created; nothing to remove.
            pass
        except Exception:
            # NOTE(review): %s-style args assume a stdlib-logging-compatible logger — confirm.
            logger.warning("Failed to delete temp facts excel: %s", output_path)
    return res
def export_findings_to_doc_comments(
self,
doc_obj: DocBase,
......@@ -442,6 +499,39 @@ class MemoryStore:
safe = safe.replace(ch, "_")
return safe[:31]
def _append_facts_sheet(self, ws: Any, facts: List[Dict[str, Any]]) -> None:
    """Write a header row and one row per fact to the worksheet ``ws``.

    For dict facts the first cell is the JSON-encoded value stored under
    ``META_KEY``; for anything else it is a fixed label. The second cell is
    always the fact rendered by ``_format_summary_for_export``.
    """
    ws.append(["元信息", "事实内容"])
    for fact in facts or []:
        if isinstance(fact, dict):
            first_cell = json.dumps(fact.get(META_KEY, None), ensure_ascii=False)
        else:
            first_cell = "事实"
        ws.append([first_cell, self._format_summary_for_export(fact)])
def _append_merge_facts_sheet(
    self, ws: Any, merge_facts: List[Dict[str, Any]]
) -> None:
    """Write a header row and one row per merged summary item to ``ws``.

    Non-dict records and the ``META_KEY`` entry are skipped. When the same
    (stripped) item name appears more than once, the last value seen wins
    while the row keeps the position of its first appearance.
    """
    ws.append(["提取项", "提取内容"])
    latest: Dict[str, Any] = {}
    for record in merge_facts or []:
        if not isinstance(record, dict):
            continue
        for key, payload in record.items():
            if key == META_KEY:
                continue
            label = str(key or "").strip()
            if label:
                latest[label] = payload
    for label, payload in latest.items():
        ws.append([label, self._format_summary_for_export(payload)])
@staticmethod
def _format_summary_for_export(value: Any, level: int = 0) -> str:
indent = " " * level
......
from __future__ import annotations
import json
from typing import Any, Dict, List, Optional
from core.config import META_KEY
from core.tool import tool, tool_func
from core.tools.segment_llm import LLMTool
from loguru import logger
# System prompt (in Chinese) handed to the LLMTool constructor by FactMergerTool.
# It tells the model to merge -- not review or infer -- the facts collected under
# one summary_name, and to answer with strict JSON.
FACT_MERGER_SYSTEM_PROMPT = """
你将收到同一个 summary_name 下,从多个合同分段提取出的 facts 列表。
你的任务是做事实合并,而不是审查或推断:
1. 仅基于输入 facts 合并信息,不得新增输入中不存在的事实。
2. 去除重复、空值和无意义的“未明确”信息;如果全部都是“未明确”,保留“未明确”。
3. 保留关键条件、金额、比例、期限、主体、触发条件等具体信息。
4. 当多个分段信息互补时,合并为结构化对象或对象列表。
5. 当信息存在明显冲突且无法判断真伪时,同时保留并标注为不同表述,不要自行裁决。
6. 严格输出 JSON。
"""
# User-message template; filled through str.format with `summary_name` and `facts`.
FACT_MERGER_USER_PROMPT = """
summary_name:
{summary_name}
facts:
{facts}
请输出 JSON,格式如下:
"""
# Example of the expected reply shape. It is concatenated AFTER the template has
# been formatted, so its literal braces never pass through str.format.
OUTPUT_EXAMPLE = """
```json
{
"value": {
"合并后的字段": "合并后的事实"
}
}
```
"""
def _normalize_summary_names(summary_names: Optional[List[str]]) -> List[str]:
names: List[str] = []
seen = set()
for name in summary_names or []:
text = str(name or "").strip()
if not text or text in seen:
continue
seen.add(text)
names.append(text)
return names
def _is_empty_fact_value(value: Any) -> bool:
if value is None:
return True
if isinstance(value, str):
return not value.strip()
if isinstance(value, dict):
return not value
if isinstance(value, list):
return not value
return False
def _dedupe_values(values: List[Any]) -> List[Any]:
    """Drop empty values and duplicates, keeping first occurrences in order.

    Two values are duplicates when their key-sorted JSON encodings match;
    values that JSON cannot encode are compared by their ``str`` form.
    """
    first_seen: Dict[str, Any] = {}
    for candidate in values:
        if _is_empty_fact_value(candidate):
            continue
        try:
            fingerprint = json.dumps(candidate, ensure_ascii=False, sort_keys=True)
        except TypeError:
            fingerprint = str(candidate)
        # setdefault keeps the earliest value for each fingerprint.
        first_seen.setdefault(fingerprint, candidate)
    return list(first_seen.values())
def _extract_fact_values(
    facts: List[Dict[str, Any]], summary_names: List[str]
) -> Dict[str, List[Any]]:
    """Collect, per summary name, the de-duplicated values found in ``facts``.

    Non-dict entries in ``facts`` are ignored. Every requested name appears
    in the result, mapped to an empty list when no fact carries it.
    """
    buckets: Dict[str, List[Any]] = {name: [] for name in summary_names}
    dict_facts = (record for record in (facts or []) if isinstance(record, dict))
    for record in dict_facts:
        for name in summary_names:
            if name in record:
                buckets[name].append(record.get(name))
    return {name: _dedupe_values(found) for name, found in buckets.items()}
def _deterministic_merge(values: List[Any]) -> Any:
    """Merge values by rule, without calling the LLM.

    After de-duplication: no values -> ``{}``, exactly one -> that value,
    several -> the list of distinct values.
    """
    distinct = _dedupe_values(values)
    if len(distinct) > 1:
        return distinct
    return distinct[0] if distinct else {}
# Tool that merges, per summary_name, the facts extracted from several contract
# segments into a single value -- via the LLM ("llm" mode) or by plain
# de-duplication ("rule" mode).
@tool("fact_merger", "同 summary_name facts 合并")
class FactMergerTool(LLMTool):
    def __init__(self) -> None:
        super().__init__(FACT_MERGER_SYSTEM_PROMPT)

    # Entry point.
    #   facts:         fact records; only dict entries are inspected.
    #   summary_names: names whose values should be merged (stripped and de-duplicated here).
    #   merge_mode:    "llm" (default) or "rule"; any other value behaves like "rule".
    # Returns {"merge_facts": {...}} mapping each name that had at least one
    # non-empty value to its merged value, plus a META_KEY entry describing the run.
    @tool_func(
        {
            "type": "object",
            "properties": {
                "facts": {"type": "array", "items": {"type": "object"}},
                "summary_names": {"type": "array", "items": {"type": "string"}},
                "merge_mode": {
                    "type": "string",
                    "enum": ["llm", "rule"],
                    "default": "llm",
                },
            },
            "required": ["facts", "summary_names"],
        }
    )
    def run(
        self,
        facts: List[Dict[str, Any]],
        summary_names: List[str],
        merge_mode: str = "llm",
    ) -> Dict[str, Any]:
        names = _normalize_summary_names(summary_names)
        if not names:
            return {"merge_facts": {}}
        grouped = _extract_fact_values(facts, names)
        merged: Dict[str, Any] = {}
        for name in names:
            values = grouped.get(name, [])
            if not values:
                # Names with no usable value are left out of the result entirely.
                continue
            merged[name] = self._merge_values(name, values, merge_mode)
        # NOTE(review): META_KEY is attached even when no name produced a value, so
        # the mapping is never empty for non-empty names -- confirm that callers
        # testing its truthiness expect this.
        merged[META_KEY] = {
            "summary_names": names,
            "source_fact_count": len(facts or []),
        }
        return {"merge_facts": merged}

    def _merge_values(self, summary_name: str, values: List[Any], merge_mode: str) -> Any:
        """Merge the values of one summary name.

        Zero or one distinct value is returned directly (``{}`` when none).
        Otherwise the LLM is used in "llm" mode, with a fallback to the
        rule-based merge if the LLM path raises; any other mode goes
        straight to the rule-based merge.
        """
        unique_values = _dedupe_values(values)
        if len(unique_values) <= 1:
            return unique_values[0] if unique_values else {}
        if str(merge_mode or "llm").lower() != "llm":
            return _deterministic_merge(unique_values)
        try:
            return self._merge_values_with_llm(summary_name, unique_values)
        except Exception as exc:
            # Best-effort: an LLM failure must not abort the whole merge.
            # loguru formats messages with str.format, so the placeholder must be
            # "{}"; a "%s" placeholder would be logged literally and drop the error.
            logger.error("LLM fact merge failed, fallback to rule merge: {}", exc)
            return _deterministic_merge(unique_values)

    def _merge_values_with_llm(self, summary_name: str, values: List[Any]) -> Any:
        """Ask the LLM to merge ``values`` and return the ``value`` field of its reply.

        Raises:
            ValueError: If the parsed reply is not a JSON object or lacks
                the ``value`` key.
        """
        # OUTPUT_EXAMPLE is appended after formatting because its literal braces
        # would otherwise be interpreted by str.format.
        user_content = (
            FACT_MERGER_USER_PROMPT.format(
                summary_name=summary_name,
                facts=json.dumps(values, ensure_ascii=False, indent=2),
            )
            + OUTPUT_EXAMPLE
        )
        msgs = self.build_messages(user_content)
        resp = self.run_with_loop(self.chat_async(msgs))
        data = self.parse_first_json(resp)
        # Guard the shape explicitly so a malformed reply yields a clear error
        # instead of an incidental TypeError/AttributeError.
        if not isinstance(data, dict):
            raise ValueError("fact merge response is not a JSON object")
        if "value" not in data:
            raise ValueError("missing value in fact merge response")
        return data.get("value")
No preview for this file type
......@@ -27,6 +27,7 @@ from core.tools.rule_filter import LufaPartyRuleFilterTool
from core.tools.retrieve_reference import RetrieveReferenceTool
from core.tools.reflect_retry import ReflectRetryTool
from core.tools.segment_merger import SegmentMergerTool
from core.tools.fact_merger import FactMergerTool
from core.memory import Finding
from core.memory import FINDING_KEY_MERGE, FINDING_KEY_REFLECT, FINDING_KEY_REVIEW
......@@ -40,6 +41,7 @@ lufa_party_rule_filter_tool = LufaPartyRuleFilterTool()
reference_tool = RetrieveReferenceTool()
reflect_tool = ReflectRetryTool()
merger_tool = SegmentMergerTool()
fact_merger_tool = FactMergerTool()
@app.post("/sleep")
......@@ -64,6 +66,7 @@ class DocumentParseResponse(BaseModel):
conversation_id: str
segment_ids: List[int]
ruleset_items: List[str]
summary_names: List[str]
text: Optional[str] = None
file_ext: Optional[str] = None
file_name: Optional[str] = None
......@@ -102,11 +105,19 @@ async def parse_document(payload: DocumentParseRequest) -> DocumentParseResponse
for t in (r.get("title") for r in ruleset_items)
if isinstance(t, str) and t.strip()
]
summary_names = list(
dict.fromkeys(
s.strip()
for s in (r.get("summary") for r in ruleset_items)
if isinstance(s, str) and s.strip()
)
)
return DocumentParseResponse(
conversation_id=payload.conversation_id,
text=text,
segment_ids=segment_ids,
ruleset_items=ruleset_review_items,
summary_names=summary_names,
file_ext=file_ext,
file_name=Path(file_path).name,
)
......@@ -371,6 +382,18 @@ class MergerResponse(BaseModel):
merged_findings: List[Dict]
# Request body of POST /segments/summary/facts/merger.
class FactsMergerRequest(BaseModel):
    conversation_id: str  # selects the memory store via get_cached_memory
    summary_names: List[str]  # names to merge; the endpoint rejects an effectively empty list with HTTP 400
    merge_mode: Literal["llm", "rule"] = "llm"  # forwarded unchanged to fact_merger_tool.run
# Response body of POST /segments/summary/facts/merger.
class FactsMergerResponse(BaseModel):
    conversation_id: str  # echoed from the request
    summary_names: List[str]  # the cleaned names the endpoint actually used
    merge_facts: Dict  # the "merge_facts" mapping returned by fact_merger_tool.run (may be empty)
@app.post("/segments/review/reflect", response_model=ReflectReviewResponse)
def reflect_review(payload: ReflectReviewRequest) -> ReflectReviewResponse:
store = get_cached_memory(payload.conversation_id)
......@@ -470,6 +493,33 @@ def merge_segment_findings(payload: MergerRequest) -> MergerResponse:
)
# Merge the stored facts of one conversation per summary name and record the result.
# Responds with HTTP 400 when no usable summary name is supplied.
@app.post("/segments/summary/facts/merger", response_model=FactsMergerResponse)
def merge_summary_facts(payload: FactsMergerRequest) -> FactsMergerResponse:
    # Strip, drop blanks and de-duplicate (first occurrence wins) so the echoed
    # names match what the merger tool works with, as parse_document does for
    # its own summary_names.
    summary_names = list(
        dict.fromkeys(
            name.strip()
            for name in (payload.summary_names or [])
            if isinstance(name, str) and name.strip()
        )
    )
    if not summary_names:
        raise HTTPException(status_code=400, detail="summary_names cannot be empty")

    store = get_cached_memory(payload.conversation_id)
    result = fact_merger_tool.run(
        facts=store.get_facts(),
        summary_names=summary_names,
        merge_mode=payload.merge_mode,
    )
    merge_facts = result.get("merge_facts", {}) or {}
    if merge_facts:
        # Each merge run is appended to the store as its own record.
        store.add_merge_facts(merge_facts)
    return FactsMergerResponse(
        conversation_id=payload.conversation_id,
        summary_names=summary_names,
        merge_facts=merge_facts,
    )
########################################################################################################################
class ConversationResponse(BaseModel):
conversation_id: str
......@@ -532,6 +582,16 @@ class MemoryExportResponse(BaseModel):
doc_url: str
# Request body of POST /memory/facts/export.
class FactsMemoryExportRequest(BaseModel):
    conversation_id: str  # selects the memory store via get_cached_memory
    file_name: Optional[str] = None  # optional name passed to export_facts_to_excel
# Response body of POST /memory/facts/export.
class FactsMemoryExportResponse(BaseModel):
    conversation_id: str  # echoed from the request
    excel_url: Any  # whatever store.export_facts_to_excel returned; shape not fixed here
@app.post("/memory/export", response_model=MemoryExportResponse)
def export_memory(payload: MemoryExportRequest) -> MemoryExportResponse:
store = get_cached_memory(payload.conversation_id)
......@@ -567,6 +627,22 @@ def export_memory(payload: MemoryExportRequest) -> MemoryExportResponse:
)
# Export a conversation's facts and merged facts to Excel and return the upload result.
# Any failure is reported as HTTP 500; an ImportError's message is passed through
# as-is, other errors are prefixed with "Export facts failed: ".
@app.post("/memory/facts/export", response_model=FactsMemoryExportResponse)
def export_facts_memory(payload: FactsMemoryExportRequest) -> FactsMemoryExportResponse:
    memory = get_cached_memory(payload.conversation_id)
    try:
        upload_result = memory.export_facts_to_excel(file_name=payload.file_name)
    except Exception as exc:
        if isinstance(exc, ImportError):
            detail = str(exc)
        else:
            detail = f"Export facts failed: {exc}"
        raise HTTPException(status_code=500, detail=detail)
    return FactsMemoryExportResponse(
        conversation_id=payload.conversation_id,
        excel_url=upload_result,
    )
if __name__ == "__main__":
from core.config import use_lufa
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment