Commit 05df4d3c by ccran

feat: update lufa prompt;

parent 58af8ced
......@@ -8,6 +8,7 @@ from core.config import pdf_support_formats
MAX_CACHE = 128
def _normalize_file_ext(file_ext: str) -> str:
if not file_ext:
raise ValueError("file_ext is required")
......@@ -24,6 +25,7 @@ def get_cached_doc_tool(conversation_id: str, file_ext: str) -> Tuple[DocBase, s
return SpirePdfDoc(), ext
return SpireWordDoc(), ext
@lru_cache(maxsize=MAX_CACHE)
def get_cached_memory(conversation_id: str) -> MemoryStore:
    """Return the memoized ``MemoryStore`` for a conversation.

    The store is built from the file name
    ``memory_store_<conversation_id>.json``. ``lru_cache`` keeps up to
    ``MAX_CACHE`` results, so repeated calls with the same id return the
    same object while it stays in the cache.
    """
    store_file = f"memory_store_{conversation_id}.json"
    memory = MemoryStore(store_file)
    return memory
......@@ -5,6 +5,13 @@ from dataclasses import dataclass
use_docker = False
# @dataclass
# class LLMConfig:
# base_url: str = "https://api.deepseek.com/v1"
# api_key: str = "sk-3df81e63afe44ca39cbd7108d59bc91a"
# model: str = "deepseek-v4-pro"
@dataclass
class LLMConfig:
base_url: str = "http://192.168.252.71:9002/v1"
......@@ -17,10 +24,25 @@ MERGE_RULE_PROMPT = False
MAX_SINGLE_CHUNK_SIZE = 5000
META_KEY = "META"
DEFAULT_RULESET_ID = "通用"
ALL_RULESET_IDS = ["通用", "借款", "担保", "财务口", "金盘", "金盘简化", "麓发测试"]
ALL_RULESET_IDS = [
"通用",
"借款",
"担保",
"财务口",
"金盘",
"金盘简化",
"麓发测试",
"麓发标准",
]
MAX_WORKERS = 10
FILE_SUFFIX = "-审核批注"
## 关键参数**
use_non_fastgpt_llm = False
use_lufa = True
use_jp_machine = True
use_lufa = False
## 关键参数**
if use_lufa:
outer_backend_url = "http://znkf.lgfzgroup.com:48081"
base_fastgpt_url = "http://192.168.252.71:18089"
......@@ -32,9 +54,14 @@ if use_lufa:
"fastgpt-ao3al2vgfnArt9qi2bTpPeRHouCO7qngUZiQsIM1E2x91u22z65J"
)
else:
if not use_jp_machine:
outer_backend_url = "http://218.77.58.8:48080"
base_fastgpt_url = "http://192.168.252.71:18088"
base_backend_url = "http://192.168.252.71:48080"
else:
outer_backend_url = "http://172.21.107.45:48080"
base_fastgpt_url = "http://172.21.107.45:38080"
base_backend_url = "http://172.21.107.45:48080"
segment_review_api_key = (
"fastgpt-vLu2JHAfqwEq5FUQhvATFDK0yDS6fs804v7KwWBMyU4sRrHzh4UGl89Zpa"
)
......@@ -55,11 +82,19 @@ if use_docker:
root_path = "/app"
LLM = {
"base_tool_llm": LLMConfig(),
"fastgpt_segment_review": LLMConfig(
"fastgpt_segment_review": (
LLMConfig()
if use_non_fastgpt_llm
else LLMConfig(
base_url=f"{base_fastgpt_url}/api/v1", api_key=segment_review_api_key
)
),
"fastgpt_reflect_retry": LLMConfig(
"fastgpt_reflect_retry": (
LLMConfig()
if use_non_fastgpt_llm
else LLMConfig(
base_url=f"{base_fastgpt_url}/api/v1", api_key=reflect_retry_api_key
)
),
}
doc_support_formats = [".docx", ".doc", ".wps"]
......
......@@ -11,13 +11,12 @@ from uuid import uuid4
from utils.http_util import upload_file
from utils.doc_util import DocBase
from core.config import META_KEY
from core.config import META_KEY, FILE_SUFFIX, use_lufa
logger = logging.getLogger(__name__)
_ALLOWED_RISK_LEVELS = {"H", "M", "L", ""}
_ALLOWED_RISK_LEVELS = {"H", "M", "L", "H,M", ""}
FINDING_KEY_REVIEW = "review"
FINDING_KEY_REFLECT = "reflect"
FINDING_KEY_MERGE = "merge"
......@@ -290,9 +289,10 @@ class MemoryStore:
raise ImportError(
"openpyxl is required for export_to_excel; install via 'pip install openpyxl'"
) from exc
ts = datetime.now().strftime("%Y%m%d_%H%M%S")
name = file_name or f"memory_export_{ts}.xlsx"
file_suffix = FILE_SUFFIX + datetime.now().strftime("%Y%m%d_%H%M%S")
name = file_name or f"memory_export.xlsx"
name = Path(name).stem + file_suffix + ".xlsx"
# print(f"Exporting to Excel with file name: {name}")
output_path = Path(__file__).resolve().parent.parent / "tmp" / name
with self._lock:
......@@ -381,13 +381,11 @@ class MemoryStore:
"""Add all findings as comments to a document, upload, then delete the local file."""
if doc_obj is None:
raise ValueError("doc_obj is required")
ts = datetime.now().strftime("%Y%m%d_%H%M%S")
doc_name = getattr(doc_obj, "_doc_name", "") or ""
suffix = Path(doc_name).suffix or ".docx"
name = file_name or f"findings_{ts}{suffix}"
if not Path(name).suffix:
name = f"{name}{suffix}"
# build suffix
file_suffix = FILE_SUFFIX + datetime.now().strftime("%Y%m%d_%H%M%S")
# derive file name
name = file_name or getattr(doc_obj, "_doc_name", "") or "memory_export.docx"
name = Path(name).stem + file_suffix + (Path(name).suffix or ".docx")
output_path = Path(__file__).resolve().parent.parent / "tmp" / name
target_key = self._normalize_finding_key(finding_key)
......@@ -398,6 +396,7 @@ class MemoryStore:
for idx, f in enumerate(target_findings, start=1):
segment_id = int(f.segment_id or 0)
chunk_id = max(segment_id, 0)
if use_lufa:
suggest_parts = []
if f.risk_level:
suggest_parts.append(f"风险等级:{f.risk_level}")
......@@ -406,6 +405,8 @@ class MemoryStore:
if f.suggestion:
suggest_parts.append(f"建议:{f.suggestion}")
suggest_text = "\n".join(suggest_parts).strip()
else:
suggest_text = f"建议:{f.suggestion}".strip()
comments.append(
{
"id": str(idx),
......@@ -517,9 +518,11 @@ def test_memory_and_export_excel():
# print("Findings search:")
# for f in hits:
# print(json.dumps(asdict(f), ensure_ascii=False, indent=2))
print(store.export_to_excel())
print(store.export_to_excel("测试"))
if __name__ == "__main__":
# test_export_findings_to_doc_comments("/home/ccran/lufa-contract/tmp/股份转让协议.docx")
test_memory_and_export_excel()
test_export_findings_to_doc_comments(
"/home/ccran/lufa-contract/tmp/1_金盘箱变采购合同.docx"
)
# test_memory_and_export_excel()
from __future__ import annotations
from typing import Any, Dict, List
from core.tool import tool, tool_func
@tool("rule_filter", "规则过滤")
class RuleFilterTool:
    """Base class for rule-filter tools.

    ``run`` is a placeholder that always raises ``NotImplementedError``;
    concrete filters (e.g. ``LufaPartyRuleFilterTool``) override it.
    """

    # The schema handed to ``tool_func`` declares a single required
    # object-typed argument named ``payload``.
    @tool_func(
        {
            "type": "object",
            "properties": {
                "payload": {"type": "object"},
            },
            "required": ["payload"],
        }
    )
    def run(self, payload: Dict[str, Any]) -> Dict[str, Any]:
        """Filter ``payload``; must be overridden by subclasses.

        Raises:
            NotImplementedError: always, in this base implementation.
        """
        raise NotImplementedError("Subclasses must implement run")
@tool("lufa_party_rule_filter_tool", "LUFA 当事人与支付主体规则过滤")
class LufaPartyRuleFilterTool(RuleFilterTool):
    """Rule filter for the LUFA party / paying-entity review rules.

    NOTE(review): the filtering logic is currently commented out, so
    ``run`` hands the payload back untouched.
    """

    # Same schema as the base class: one required object argument ``payload``.
    @tool_func(
        {
            "type": "object",
            "properties": {
                "payload": {"type": "object"},
            },
            "required": ["payload"],
        }
    )
    def run(self, payload: Dict[str, Any]) -> Dict[str, Any]:
        """Return ``payload`` unchanged (filtering is disabled).

        The disabled code below read ``rules``, ``segment_idx`` and
        ``total_segments`` from the payload. For segments in the first half
        it dropped rules whose title contains "支付主体审查"; for the
        remaining segments it dropped rules whose title contains
        "当事人审查".
        """
        # Disabled implementation, kept for reference:
        # rules = payload.get("rules") or []
        # segment_idx = int(payload.get("segment_idx", 0))
        # total_segments = int(payload.get("total_segments", 0))
        # if not rules or total_segments <= 0:
        # payload["rules"] = rules
        # return payload
        # # With an odd number of segments, the middle segment is assigned to the first half
        # first_half_count = (total_segments + 1) // 2
        # if segment_idx < first_half_count:
        # filtered_rules: List[Dict[str, Any]] = [
        # r for r in rules if "支付主体审查" not in str(r.get("title", ""))
        # ]
        # else:
        # filtered_rules = [
        # r for r in rules if "当事人审查" not in str(r.get("title", ""))
        # ]
        # payload["rules"] = filtered_rules
        return payload
from __future__ import annotations
import difflib
import json
import re
import unicodedata
......@@ -11,7 +10,6 @@ from core.tools.segment_llm import LLMTool
from loguru import logger
import traceback
MERGER_SYSTEM_PROMPT = """
你将收到同一组 findings 的 issue 与 suggestion 列表,请做信息融合而非机械拼接。
......@@ -107,17 +105,17 @@ def _merge_text_union(base: str, other: str) -> str:
return f"{left}\n{right}"
def _has_substring_overlap(a: str, b: str, min_common_len: int = 8) -> bool:
def _has_substring_overlap(a: str, b: str, min_common_len: int = 5) -> bool:
left = _normalize_text_for_match(str(a or ""))
right = _normalize_text_for_match(str(b or ""))
if not left or not right:
return False
if left in right or right in left:
return True
match = difflib.SequenceMatcher(None, left, right).find_longest_match(
0, len(left), 0, len(right)
# Only treat edge overlap as related: suffix(left)->prefix(right) or reverse.
overlap_len = max(
_max_suffix_prefix_overlap(left, right),
_max_suffix_prefix_overlap(right, left),
)
return match.size >= min_common_len
return overlap_len >= min_common_len
def _normalize_text_for_match(text: str) -> str:
......@@ -225,7 +223,13 @@ def _rule_based_merge(
groups.append([findings[idx] for idx in group_idx])
return [_merge_group(group, field_merger=field_merger) for group in groups]
merged_findings: List[Dict[str, Any]] = []
for group in groups:
if _should_skip_group_merge(group):
merged_findings.extend([_normalize_finding(item) for item in group])
continue
merged_findings.append(_merge_group(group, field_merger=field_merger))
return merged_findings
def _deterministic_field_merge(group: List[Dict[str, Any]]) -> Dict[str, str]:
......@@ -237,6 +241,15 @@ def _deterministic_field_merge(group: List[Dict[str, Any]]) -> Dict[str, str]:
}
def _should_skip_group_merge(group: List[Dict[str, Any]]) -> bool:
    """Tell whether a group of findings must be left unmerged.

    A group is skipped when it holds fewer than two findings, or when any
    finding has a missing, falsy, or whitespace-only ``original_text``.
    """
    if len(group) < 2:
        return True
    return any(
        str(entry.get("original_text", "") or "").strip() == ""
        for entry in group
    )
@tool("segment_merger", "同证据 findings 合并")
class SegmentMergerTool(LLMTool):
def __init__(self) -> None:
......@@ -321,30 +334,30 @@ if __name__ == "__main__":
tool = SegmentMergerTool()
sample = [
{
"rule_title": "支付时间审查",
"rule_title": "预付款审查",
"segment_id": 0,
"original_text": "本协议约定的服务内容全部履行完毕经甲方认可,在乙方提交检测数据后15个工作日内,乙方须向甲方提供相应数额的正规发票,甲方一次性支付合同总金额的100%给乙方。",
"issue": "付款条件缺乏实质把控。条款将付款绑定于提交数据,未明确'经甲方认可'的验收标准、期限及异议机制,且未设置质保金,存在验收流于形式即需全额付款的风险。",
"original_text": "丙方提交的下列单据经甲方、乙方审核无误后 20 个工作日内,支付该批次设备合同价格 70% 的到款项,合同生效后,丙方提供下列材料,甲方、乙方审核无误后20个工作日内支付给丙方本合同总价的10%的款项(¥458,000元,人民币大写肆拾伍万捌仟元整)作为预付款",
"issue": "根据审查规则,预付款比例应大于等于合同总价款的20%,或约定发货前付清全款。合同第3.2.1条约定预付款比例为合同总价的10%(458,000元),未达到公司规定的20%最低比例要求,且未约定发货前付清全款,存在资金占用风险。",
"risk_level": "H",
"suggestion": "修改为:'乙方提交报告后,甲方在X个工作日内验收。验收合格且收到发票后15个工作日内支付95%;剩余5%作为质保金,满X个月无异议后无息支付。若验收不合格,甲方有权拒付并要求整改。'",
"suggestion": "将预付款比例修改为合同总价的20%。建议修改为:“合同生效后,丙方提供下列材料,甲方、乙方审核无误后20个工作日内支付给丙方本合同总价的20%的款项(¥916,000元,人民币大写玖拾壹万陆仟元整)作为预付款”。",
"result": "不合格",
},
{
"rule_title": "发票审查",
"rule_title": "付款时间审查",
"segment_id": 0,
"original_text": "在乙方提交检测数据后15个工作日内,乙方须向甲方提供相应数额的正规发票,甲方一次性支付合同总金额的100%给乙方。甲方指定由其全资子公司长沙高新控股集团有限公司(简称高新控股)承担并支付本合同约定的检查服务费,乙方向高新控股开具相应金额的增值税专用发票。",
"issue": "缺失发票税率约定。条款明确了发票类型和开具时间,但未约定适用税率,违反审查规则,可能导致后续开票金额争议或税务合规风险。",
"original_text": "丙方按合同约定和交货通知单的要求交付合同设备后,现场经清点无误并验收合格,丙方提交的下列单据经甲方、乙方审核无误后 20 个工作日内,支付该批次设备合同价格 70% 的到款项",
"issue": "根据审查规则,发货后的付款项(如到货款)若未约定发货前全额付款,必须以“到货 XX 天/月”或相似表述作为充分条件之一,且若有多个条件需提及“先到为准”。当前条款约定支付“到货款”的条件仅为“现场经清点无误并验收合格”,属于以验收结果作为唯一触发条件,未设置“到货 XX 天”的闭口时间限制。若买方拖延验收,将导致卖方收款时间无限期延后,不符合规则要求。",
"risk_level": "H",
"suggestion": "补充税率约定。建议在'乙方向高新控股开具相应金额的增值税专用发票'后补充:'(税率:6%)'或根据实际业务类型补充具体税率数值。",
"suggestion": "修改为:丙方按合同约定和交货通知单的要求交付合同设备后,现场经清点无误并验收合格,或自货物到达现场之日起【30】日内(以先到者为准),丙方提交的下列单据经甲方、乙方审核无误后 20 个工作日内,支付该批次设备合同价格 70% 的到款项。",
"result": "不合格",
},
{
"rule_title": "主体审查",
"segment_id": 0,
"original_text": "委托方(甲方): 湖南麓谷发展集团有限公司... 甲方指定由其全资子公司长沙高新控股集团有限公司(简称高新控股)承担并支付本合同约定的检查服务费... 签章处:甲方:长沙高新控股集团有限公司",
"issue": "签约主体不一致。首部甲方为'湖南麓谷发展集团有限公司',但签章处及付款义务主体变更为'长沙高新控股集团有限公司',且未明确授权委托或变更确认条款,存在主体混同及履约风险。",
"original_text": "3.2.2 ...支付该批次设备合同价格70%的到款项; ... B.乙方开具给甲方、丙方开具给乙方的金额为该批次合同价格100%的增值税专用发票;",
"issue": "第3.2.2条约定到货款支付比例为70%,但条款要求开具金额为该批次合同价格100%的增值税专用发票。此时发票比例(100%)高于付款比例(70%)。根据审查规则,此类情况仅在合同中明确表述“货到”或“发货完成”时才为合格。虽然3.2.2条提到“交付合同设备后...验收合格”,但开票义务在3.3.1条中表述为无条件义务,未将“货到/发货完成”作为开具全额发票的强制前置条件,导致卖方在仅收到70%款项时需开具100%发票,不符合卖方利益。",
"risk_level": "H",
"suggestion": "统一合同主体名称。若确由子公司履约,应将首部及正文甲方统一修改为'长沙高新控股集团有限公司';若由母公司签约,应在签章处由母公司盖章,并补充'指定子公司代为履行付款义务'条款。",
"suggestion": '明确开票节点与付款节点的对应关系,修改为:"丙方应在合同设备发货后、买方支付到货款前,向乙方开具该批次设备金额100%的增值税专用发票。"',
"result": "不合格",
},
]
......
......@@ -14,6 +14,7 @@ from utils.common_util import random_str
from utils.http_util import upload_file, fastgpt_openai_chat, download_file
use_lufa = False
batch_size = 5
if not use_lufa:
SUFFIX = "_麓发迁移"
......@@ -26,13 +27,13 @@ if not use_lufa:
# 人机交互测试(测试环境)
# token = 'fastgpt-p189K5zoTX5wjp0dBybFCwsbWm3juIwlJxt2wTGyiaOWOANI5Y10pKEZzyt'
# 人机交互测试(生产环境)
# token = 'fastgpt-ry4jIjgNwmNgufMr5jR0ncvJVmSS4GZl4bx2ItsNPoncdQzW9Na3IP1Xrankr'
# token = "fastgpt-ry4jIjgNwmNgufMr5jR0ncvJVmSS4GZl4bx2ItsNPoncdQzW9Na3IP1Xrankr"
# 提取后审查测试
# token = 'fastgpt-n74gGX5ZqLT6o1ysMBSGUTjIciswYOWDRfQ75krMkE5gDVDkpzsbz8u'
else:
SUFFIX = "_麓发"
batch_input_dir_path = "lufa-input"
batch_output_dir_path = "lufa-output-standard"
batch_input_dir_path = "4.24测财务合同审核"
batch_output_dir_path = "4.24测财务合同审核-batch"
# 麓发fastgpt接口
url = "http://192.168.252.71:18089/api/v1/chat/completions"
# 麓发合同审查生产token
......@@ -41,9 +42,6 @@ else:
token = "fastgpt-mg5tQUgreJeF7peoOr5zqP0NR4EIrfS2bEVXge6FUL94Suu1TvEMR1sGNRSiV"
batch_size = 5
def extract_url(text):
# \s * ([ ^ "\s]+?\.(?:docx?|pdf|xlsx))
excel_p, doc_p = (
......
......@@ -328,7 +328,7 @@ def _parse_args() -> argparse.Namespace:
parser.add_argument(
"--val-dir",
type=Path,
default=base / "batch_output_0121_val",
default=base / "jp-output-rj-base",
help="Directory containing extracted val xlsx files.",
)
parser.add_argument(
......
......@@ -10,11 +10,11 @@ from spire.doc import Document
from compare_annotation import compare_with_log
# Map raw comment authors to unified review item names.
COMMENT_AUTHOR_MAPPING: dict[str, str] = {
"三方货款审查": "第三方审查",
"履行义务审查": "第三方审查",
"债务转移审查": "第三方审查",
"违约条款审查": "违约与延期审查",
"延期审查": "违约与延期审查",
}
......@@ -121,7 +121,7 @@ def _parse_args() -> argparse.Namespace:
parser.add_argument(
"--datasets-dir",
type=Path,
default=base / "results" / "jp-output-lufa-20260416-235546",
default=base / "results" / "jp-output-lufa-20260511-101828",
help="Directory containing Word files with annotations.",
)
parser.add_argument(
......
No preview for this file type
from spire.doc import *
from spire.doc.common import *
# Strip every review comment from an annotated Word file and save a clean copy.
annotated_path = "/home/ccran/lufa-contract/demo/湖南麓谷发展集团“主数据管理系统与合同管理系统开发”项目合同协议书-审核批注20260511_153215.docx"
cleaned_path = "/home/ccran/lufa-contract/demo/湖南麓谷发展集团“主数据管理系统与合同管理系统开发”项目合同协议书-审核批注20260511_153215-无批注.docx"

# Create a Document object and load the annotated Word file
doc = Document()
doc.LoadFromFile(annotated_path)
# To remove only the second comment instead:
# doc.Comments.RemoveAt(1)
# Remove all comments
doc.Comments.Clear()
# Save the document under the new name
doc.SaveToFile(cleaned_path)
doc.Close()
......@@ -14,10 +14,16 @@ from loguru import logger
from utils.common_util import extract_url_file, format_now
from utils.http_util import download_file
from core.cache import get_cached_doc_tool, get_cached_memory
from core.config import doc_support_formats, pdf_support_formats, MERGE_RULE_PROMPT
from core.config import (
doc_support_formats,
pdf_support_formats,
MERGE_RULE_PROMPT,
use_lufa,
)
from core.tools.segment_summary import SegmentSummaryTool
from core.tools.segment_review import SegmentReviewTool
from core.tools.segment_rule_router import SegmentRuleRouterTool
from core.tools.rule_filter import LufaPartyRuleFilterTool
from core.tools.retrieve_reference import RetrieveReferenceTool
from core.tools.reflect_retry import ReflectRetryTool
from core.tools.segment_merger import SegmentMergerTool
......@@ -30,6 +36,7 @@ TMP_DIR.mkdir(parents=True, exist_ok=True)
summary_tool = SegmentSummaryTool()
review_tool = SegmentReviewTool()
rule_router_tool = SegmentRuleRouterTool()
lufa_party_rule_filter_tool = LufaPartyRuleFilterTool()
reference_tool = RetrieveReferenceTool()
reflect_tool = ReflectRetryTool()
merger_tool = SegmentMergerTool()
......@@ -59,6 +66,7 @@ class DocumentParseResponse(BaseModel):
ruleset_items: List[str]
text: Optional[str] = None
file_ext: Optional[str] = None
file_name: Optional[str] = None
@app.post("/documents/parse", response_model=DocumentParseResponse)
......@@ -66,18 +74,13 @@ async def parse_document(payload: DocumentParseRequest) -> DocumentParseResponse
if not payload.urls:
raise HTTPException(status_code=400, detail="No URLs provided")
try:
support_formats = list(dict.fromkeys(doc_support_formats + pdf_support_formats))
filename = extract_url_file(payload.urls[0], support_formats)
except Exception as exc:
raise HTTPException(status_code=400, detail=f"Failed to parse url: {exc}")
file_path = str(TMP_DIR / filename)
try:
download_file(payload.urls[0], file_path)
file_path = download_file(payload.urls[0], TMP_DIR)
if not file_path:
raise RuntimeError("download returned empty path")
except Exception as exc:
raise HTTPException(status_code=500, detail=f"Download failed: {exc}")
# get doc tool
file_ext = payload.file_ext or Path(filename).suffix
file_ext = payload.file_ext or Path(file_path).suffix
try:
doc_obj, _ = get_cached_doc_tool(payload.conversation_id, file_ext)
except Exception as exc:
......@@ -105,6 +108,7 @@ async def parse_document(payload: DocumentParseRequest) -> DocumentParseResponse
segment_ids=segment_ids,
ruleset_items=ruleset_review_items,
file_ext=file_ext,
file_name=Path(file_path).name,
)
......@@ -285,6 +289,22 @@ def route_segment_rules(payload: SegmentReviewRequest) -> SegmentRuleRouterRespo
ruleset_id = payload.ruleset_id or reference_tool.default_ruleset_id
rules = reference_tool.run(ruleset_id=ruleset_id).get("rules", [])
if use_lufa and rules:
try:
total_segments = len(doc_obj.get_chunk_id_list() or [])
except Exception:
total_segments = 0
filtered_payload = lufa_party_rule_filter_tool.run(
{
"rules": rules,
"segment_idx": segment_idx,
"total_segments": total_segments,
}
)
rules = filtered_payload.get("rules", rules)
result = rule_router_tool.run(
segment_id=segment_idx,
segment_text=segment_text,
......@@ -508,7 +528,9 @@ def export_memory(payload: MemoryExportRequest) -> MemoryExportResponse:
try:
doc_res = store.export_findings_to_doc_comments(
doc_obj, finding_key=payload.finding_key or FINDING_KEY_REVIEW
doc_obj,
file_name=payload.file_name,
finding_key=payload.finding_key or FINDING_KEY_REVIEW,
)
except Exception as exc:
traceback.print_exc()
......
from main import FactsRetrieveRequest, retrieve_facts
from core.cache import get_cached_memory
import json
def test_retrieve_facts_direct() -> None:
    """Call ``retrieve_facts`` directly for one conversation and print its facts as JSON."""
    request = FactsRetrieveRequest(
        conversation_id="fa86563cb6c649d59e32e7def16ea6b2",
        keywords=["当事人"],
    )
    result = retrieve_facts(request)
    rendered = json.dumps(result.facts, ensure_ascii=False, indent=4)
    print(rendered)
if __name__ == "__main__":
test_retrieve_facts_direct()
......@@ -9,7 +9,7 @@ from core.config import max_chunk_page, min_single_chunk_size, max_single_chunk_
def random_str(l=5):
    """Return ``l`` distinct lowercase ASCII letters in random order.

    Letters are drawn without replacement via ``random.sample``, so a
    length above 26 raises ``ValueError``.
    """
    alphabet = "abcdefghijklmnopqrstuvwxyz"
    picked = random.sample(alphabet, l)
    return "".join(picked)
def format_now():
......@@ -19,12 +19,14 @@ def format_now():
# 从url中提取文件名称
def extract_url_file(url, support_formats):
    """Extract the file name (including its extension) embedded in a URL.

    Args:
        url: URL or path expected to contain a file name.
        support_formats: Accepted extensions, each with its leading dot
            (for example ``[".docx", ".doc"]``).

    Returns:
        The first substring of ``url`` made of file-name characters followed
        by one of the accepted extensions.

    Raises:
        ValueError: if no accepted extension is found in ``url`` (also when
            ``support_formats`` is empty). ``ValueError`` is a subclass of
            ``Exception``, so existing ``except Exception`` callers still
            catch it.
    """
    name_chars = r"[\u4e00-\u9fa5()()0-9\w-]+"
    # Try longer extensions first so ".docx" is not cut short by ".doc",
    # and escape them so the leading dot only matches a literal dot.
    ordered_formats = sorted(support_formats, key=len, reverse=True)
    pattern = "|".join(name_chars + re.escape(ext) for ext in ordered_formats)
    # An empty pattern would match the empty string; treat it as "not found".
    search_result = re.search(pattern, url) if pattern else None
    if search_result:
        return search_result.group()
    raise ValueError(f"{support_formats} not found in url:{url}")
# 调整单个页面数量
......@@ -34,7 +36,7 @@ def adjust_single_chunk_size(all_text_len):
# 从JSON字符串提取JSON对象
def extract_json(json_str:str) -> List[Dict]:
def extract_json(json_str: str) -> List[Dict]:
"""从字符串中提取 JSON 对象列表。
优先提取 ```json ... ``` 代码块;若不存在,尝试:
......@@ -43,12 +45,13 @@ def extract_json(json_str:str) -> List[Dict]:
- 从任意包含花括号/方括号的片段尝试解析
返回解析成功的 JSON 对象列表(数组会被展开)。
"""
def _try_parse_to_list(candidate: str, out_list: list) -> bool:
s = (candidate or '').strip()
s = (candidate or "").strip()
if not s:
return False
# 清理控制字符
s = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', s)
s = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]", "", s)
try:
obj = json_repair.loads(s, strict=False)
if isinstance(obj, list):
......@@ -63,26 +66,26 @@ def extract_json(json_str:str) -> List[Dict]:
results = []
# 1. 提取 ```json ... ``` 代码块
fenced_json_pattern = r'```json([\s\S]*?)```'
for match in re.findall(fenced_json_pattern, json_str or '', re.DOTALL):
fenced_json_pattern = r"```json([\s\S]*?)```"
for match in re.findall(fenced_json_pattern, json_str or "", re.DOTALL):
_try_parse_to_list(match, results)
if results:
return results
# 2. 尝试将全文解析为 JSON
if _try_parse_to_list(json_str or '', results):
if _try_parse_to_list(json_str or "", results):
return results
# 3. 提取普通 ``` ... ``` 代码块,尝试解析
fenced_any_pattern = r'```([\s\S]*?)```'
for match in re.findall(fenced_any_pattern, json_str or '', re.DOTALL):
fenced_any_pattern = r"```([\s\S]*?)```"
for match in re.findall(fenced_any_pattern, json_str or "", re.DOTALL):
if _try_parse_to_list(match, results):
return results
# 4. 从包含花括号/方括号的片段尝试解析(启发式,尽力而为)
bracket_pattern = r'(\{[\s\S]*?\}|\[[\s\S]*?\])'
for match in re.findall(bracket_pattern, json_str or '', re.DOTALL):
bracket_pattern = r"(\{[\s\S]*?\}|\[[\s\S]*?\])"
for match in re.findall(bracket_pattern, json_str or "", re.DOTALL):
_try_parse_to_list(match, results)
return results
......@@ -90,11 +93,7 @@ def extract_json(json_str:str) -> List[Dict]:
def remove_duplicates_by_key(data_list, key):
# 先按字符串长度从长到短排序
sorted_list = sorted(
data_list,
key=lambda x: len(x.get(key, "")),
reverse=True
)
sorted_list = sorted(data_list, key=lambda x: len(x.get(key, "")), reverse=True)
result = []
seen_strings = []
......@@ -109,12 +108,14 @@ def remove_duplicates_by_key(data_list, key):
def extract_drop_json_part(json_str):
    """Return ``json_str`` with every ```json fenced block removed.

    The remaining text is stripped of leading and trailing whitespace.
    """
    json_pattern = r"```json([\s\S]*?)```"
    # ``flags`` must be passed by keyword: the fourth positional argument of
    # ``re.sub`` is ``count``, so a positional ``re.DOTALL`` (== 16) would
    # silently limit the call to 16 replacements.
    non_json_content = re.sub(json_pattern, "", json_str, flags=re.DOTALL)
    return non_json_content.strip()
def group_chunk_by_len(chunk_list: List[Dict], key: str, chunk_len: int) -> List[List[Dict]]:
def group_chunk_by_len(
chunk_list: List[Dict], key: str, chunk_len: int
) -> List[List[Dict]]:
ret_chunk_list = []
sub_chunk_list = []
current_acc_len = 0 # 用于记录当前 sub_chunk 的累积长度
......@@ -144,7 +145,10 @@ def group_chunk_by_len(chunk_list: List[Dict], key: str, chunk_len: int) -> List
return ret_chunk_list
if __name__ == '__main__':
if __name__ == "__main__":
json_str = '```json{"segment_id": "seg-001"}```'
print(extract_json(json_str))
url = "/api/common/file/read/今麦郎合同审核.docx?token=eyJhbGciOiJ.kpXVCJ9.1xfdsa"
print(extract_url_file(url, [".docx", ".doc", ".wps"]))
pass
import json
import os
import re
from pathlib import Path
from urllib.parse import unquote, urlparse
import requests
from loguru import logger
......@@ -82,6 +85,26 @@ def upload_file(path, input_url_to_inner=True, output_url_to_inner=False) -> str
raise Exception(f"上传{path}失败 Response text: {response.text}")
def _resolve_download_filename(url: str, response: "requests.Response") -> str:
    """Pick a safe local file name for a downloaded response.

    Candidates are tried in this order:

    1. the ``filename*=`` parameter of the ``content-disposition`` header
       (``charset'lang'percent-encoded`` form; UTF-8 is assumed when no
       charset prefix is present),
    2. the plain ``filename=`` parameter of the same header,
    3. the last path component of ``url`` (percent-decoded),
    4. the literal ``"downloaded_file"``.

    Every candidate is reduced to its final path component, so a
    server-supplied name such as ``../../x`` or an absolute path cannot
    escape the directory the caller joins the result onto.

    Args:
        url: The URL that was requested.
        response: Response object exposing a ``headers`` mapping.

    Returns:
        A non-empty file name without any directory part.
    """

    def _safe_basename(raw: str) -> str:
        # Treat Windows separators like POSIX ones, then keep only the last
        # component; reject names that are empty or pure directory markers.
        cleaned = raw.strip().strip('"').replace("\\", "/").rstrip("/")
        name = cleaned.rsplit("/", 1)[-1].strip()
        return "" if name in ("", ".", "..") else name

    content_disposition = response.headers.get("content-disposition", "") or ""
    if content_disposition:
        # Extended form first: it carries the properly encoded name.
        extended = re.search(
            r"filename\*\s*=\s*([^;]+)", content_disposition, re.IGNORECASE
        )
        if extended:
            value = extended.group(1).strip().strip('"')
            charset = "utf-8"
            pieces = value.split("'", 2)
            if len(pieces) == 3:
                charset, value = pieces[0] or "utf-8", pieces[2]
            try:
                decoded = unquote(value, encoding=charset, errors="replace")
            except LookupError:
                # Unknown charset label: fall back to UTF-8 decoding.
                decoded = unquote(value)
            name = _safe_basename(decoded)
            if name:
                return name
        plain = re.search(
            r'filename\s*=\s*"?([^";]+)"?', content_disposition, re.IGNORECASE
        )
        if plain:
            name = _safe_basename(unquote(plain.group(1)))
            if name:
                return name

    url_name = _safe_basename(unquote(urlparse(url).path))
    return url_name or "downloaded_file"
# 下载url到本地path
def download_file(url, path, input_url_to_inner=True):
if not url.startswith("http:"):
......@@ -92,13 +115,21 @@ def download_file(url, path, input_url_to_inner=True):
response = requests.get(url)
# 确保请求成功
if response.status_code == 200:
target_path = Path(path)
if target_path.exists() and target_path.is_dir():
target_path = target_path / _resolve_download_filename(url, response)
target_path.parent.mkdir(parents=True, exist_ok=True)
# 打开本地文件,准备写入数据
with open(path, "wb") as f:
with open(target_path, "wb") as f:
# 写入响应的内容
f.write(response.content)
logger.info(f"{url}文件下载成功,保存到{path}")
logger.info(f"{url}文件下载成功,保存到{target_path}")
return str(target_path)
else:
logger.error(f"{url}文件下载失败. HTTP Status Code: {response.status_code}")
return None
def url_replace_fastgpt(origin: str):
......
......@@ -4,14 +4,7 @@ from openai import AsyncOpenAI
from dataclasses import dataclass
from tenacity import retry, stop_after_attempt, stop_after_delay, wait_fixed
import asyncio
from core.config import MAX_WORKERS
@dataclass
class LLMConfig:
base_url: str
api_key: str
model: str
from core.config import MAX_WORKERS, LLMConfig, use_non_fastgpt_llm
class OpenAITool:
......@@ -25,10 +18,15 @@ class OpenAITool:
@retry(stop=stop_after_delay(600) | stop_after_attempt(3), wait=wait_fixed(1))
async def chat(self, msg, tools=None):
if tools is None:
extra_body = None
extra_body = {}
# fastgpt专用:如果第一个消息是system角色,则将其内容放入extra_body.variables.system,并从消息列表中移除
if not use_non_fastgpt_llm:
if msg[0]["role"] == "system":
extra_body = {"variables": {"system": msg[0]["content"]}}
msg = msg[1:]
# deepseek专用关闭思考
extra_body["thinking"] = {"type": "disabled"}
try:
response = await self.client.chat.completions.create(
model=self.llm_config.model, messages=msg, extra_body=extra_body
)
......@@ -36,6 +34,9 @@ class OpenAITool:
reasoning_content = response.choices[0].message.model_extra.get(
"reasoning_content", ""
)
except Exception as e:
logger.error(f"LLM调用失败: {e} | response: {response}")
raise e
return content
else:
response = await self.client.chat.completions.create(
......@@ -54,3 +55,19 @@ class OpenAITool:
return await self.chat(m, tools)
return await asyncio.gather(*[_wrapped(m) for m in msgs])
if __name__ == "__main__":
    # Manual smoke test: send one system + user exchange through OpenAITool
    # using the default LLMConfig and print the reply. (The previously
    # imported ``json`` module was unused here and has been dropped.)
    llm_config = LLMConfig()
    tool = OpenAITool(llm_config)
    messages = [
        {
            "role": "system",
            "content": "你是我的人工智能助手,协助我分析问题并提供建议。",
        },
        {"role": "user", "content": "请分析以下问题:为什么天空是蓝色的?"},
    ]
    response = asyncio.run(tool.chat(messages))
    print("LLM Response:", response)
......@@ -4,6 +4,7 @@ import re
from thefuzz import fuzz
from utils.doc_util import DocBase
from utils.common_util import adjust_single_chunk_size
from core.config import use_lufa
import os
......@@ -713,6 +714,40 @@ class SpireWordDoc(DocBase):
text_range = text_sel.GetAsOneRange()
return self._insert_comment_by_text_range(text_range, author, comment_content)
def add_comment_to_first_paragraph(
    self, comment_text, author="审阅助手", target_text=None
):
    """
    Attach a comment to the first paragraph of the document's first section.

    ``target_text`` is not used; it is kept only for compatibility with
    older callers.

    Returns ``True`` once the comment has been attached, or ``False``
    (after logging an error) when the document has no sections or the
    first section has no paragraphs.
    """
    self._ensure_loaded()

    # Nothing to anchor the comment to if the document has no sections.
    if self._doc.Sections.Count == 0:
        logger.error("文档中未找到任何节,无法添加批注")
        return False

    section = self._doc.Sections.get_Item(0)
    if section.Paragraphs.Count == 0:
        logger.error("第一节未找到段落,无法添加批注")
        return False

    paragraph = section.Paragraphs.get_Item(0)

    # Build the comment body and author, then append it to the paragraph.
    comment = Comment(self._doc)
    comment.Body.AddParagraph().Text = comment_text
    comment.Format.Author = author
    paragraph.ChildObjects.Add(comment)

    # Both range marks carry the comment's id so they pair with it.
    comment_start = CommentMark(self._doc, CommentMarkType.CommentStart)
    comment_end = CommentMark(self._doc, CommentMarkType.CommentEnd)
    comment_start.CommentId = comment.Format.CommentId
    comment_end.CommentId = comment.Format.CommentId

    # Start mark goes to the front of the paragraph and the end mark to the
    # back — presumably so the commented range spans the whole paragraph
    # (TODO confirm against Spire.Doc behaviour).
    paragraph.ChildObjects.Insert(0, comment_start)
    paragraph.ChildObjects.Add(comment_end)
    return True
# 设置chunk批注
def add_table_comment(
self, table, target_text, comment_text, author="审阅助手", initials="AI"
......@@ -778,6 +813,25 @@ class SpireWordDoc(DocBase):
)
author = self.format_comment_author(comment)
suggest = comment.get("suggest", "")
original_text = (comment.get("original_text") or "").strip()
# original_text 为空时,直接落在文档首个可用段落。
if not original_text:
existing_comment_idx = self.find_comment(author)
if existing_comment_idx is not None:
self._update_comment_content(existing_comment_idx, suggest)
continue
first_para_author = self._decorate_author_with_match_type(
author, "exact"
)
matched = self.add_comment_to_first_paragraph(
suggest, first_para_author
)
if not matched:
logger.error("original_text 为空,且未能在首段落添加批注")
continue
find_key = comment["original_text"].strip() or comment["key_points"]
# 先检查是否已有同一“规则ID|要点”的批注,避免重复插入。
......@@ -868,6 +922,18 @@ class SpireWordDoc(DocBase):
def to_file(self, path, remove_prefix=False):
self._ensure_loaded()
# watermark_text = (
# "Evaluation Warning: The document was created with Spire.Doc for Python."
# )
# if self._doc.Sections.Count > 0:
# section = self._doc.Sections.get_Item(0)
# if section.Paragraphs.Count > 0:
# first_paragraph = section.Paragraphs.get_Item(0)
# first_text = (first_paragraph.Text or "").strip()
# if first_text == watermark_text:
# section.Paragraphs.RemoveAt(0)
if remove_prefix:
self.remove_comment_prefix()
self._doc.SaveToFile(path)
......@@ -886,8 +952,9 @@ class SpireWordDoc(DocBase):
if __name__ == "__main__":
doc = SpireWordDoc()
doc.load(r"/home/ccran/lufa-contract/demo/今麦郎合同审核.docx")
print(doc._doc_name)
print("附件2《技术协议》" in doc.get_all_text())
# print(doc._doc_name)
# print("附件2《技术协议》" in doc.get_all_text())
# doc.add_chunk_comment(
# 0,
# [
......@@ -902,4 +969,6 @@ if __name__ == "__main__":
# }
# ],
# )
# doc.to_file("/home/ccran/lufa-contract/demo/今麦郎合同审核_test.docx", True)
doc.add_comment_to_first_paragraph("这是第一段的批注", "审阅助手")
doc.to_file("/home/ccran/lufa-contract/demo/今麦郎合同审核_test.docx", True)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment