Commit 05df4d3c by ccran

feat: update lufa prompt;

parent 58af8ced
...@@ -8,6 +8,7 @@ from core.config import pdf_support_formats ...@@ -8,6 +8,7 @@ from core.config import pdf_support_formats
MAX_CACHE = 128 MAX_CACHE = 128
def _normalize_file_ext(file_ext: str) -> str: def _normalize_file_ext(file_ext: str) -> str:
if not file_ext: if not file_ext:
raise ValueError("file_ext is required") raise ValueError("file_ext is required")
...@@ -24,6 +25,7 @@ def get_cached_doc_tool(conversation_id: str, file_ext: str) -> Tuple[DocBase, s ...@@ -24,6 +25,7 @@ def get_cached_doc_tool(conversation_id: str, file_ext: str) -> Tuple[DocBase, s
return SpirePdfDoc(), ext return SpirePdfDoc(), ext
return SpireWordDoc(), ext return SpireWordDoc(), ext
@lru_cache(maxsize=MAX_CACHE) @lru_cache(maxsize=MAX_CACHE)
def get_cached_memory(conversation_id: str) -> MemoryStore: def get_cached_memory(conversation_id: str) -> MemoryStore:
return MemoryStore(f'memory_store_{conversation_id}.json') return MemoryStore(f"memory_store_{conversation_id}.json")
\ No newline at end of file
...@@ -5,6 +5,13 @@ from dataclasses import dataclass ...@@ -5,6 +5,13 @@ from dataclasses import dataclass
use_docker = False use_docker = False
# @dataclass
# class LLMConfig:
# base_url: str = "https://api.deepseek.com/v1"
# api_key: str = "sk-3df81e63afe44ca39cbd7108d59bc91a"
# model: str = "deepseek-v4-pro"
@dataclass @dataclass
class LLMConfig: class LLMConfig:
base_url: str = "http://192.168.252.71:9002/v1" base_url: str = "http://192.168.252.71:9002/v1"
...@@ -17,10 +24,25 @@ MERGE_RULE_PROMPT = False ...@@ -17,10 +24,25 @@ MERGE_RULE_PROMPT = False
MAX_SINGLE_CHUNK_SIZE = 5000 MAX_SINGLE_CHUNK_SIZE = 5000
META_KEY = "META" META_KEY = "META"
DEFAULT_RULESET_ID = "通用" DEFAULT_RULESET_ID = "通用"
ALL_RULESET_IDS = ["通用", "借款", "担保", "财务口", "金盘", "金盘简化", "麓发测试"] ALL_RULESET_IDS = [
"通用",
"借款",
"担保",
"财务口",
"金盘",
"金盘简化",
"麓发测试",
"麓发标准",
]
MAX_WORKERS = 10 MAX_WORKERS = 10
FILE_SUFFIX = "-审核批注"
## 关键参数**
use_non_fastgpt_llm = False
use_lufa = True
use_jp_machine = True
use_lufa = False ## 关键参数**
if use_lufa: if use_lufa:
outer_backend_url = "http://znkf.lgfzgroup.com:48081" outer_backend_url = "http://znkf.lgfzgroup.com:48081"
base_fastgpt_url = "http://192.168.252.71:18089" base_fastgpt_url = "http://192.168.252.71:18089"
...@@ -32,9 +54,14 @@ if use_lufa: ...@@ -32,9 +54,14 @@ if use_lufa:
"fastgpt-ao3al2vgfnArt9qi2bTpPeRHouCO7qngUZiQsIM1E2x91u22z65J" "fastgpt-ao3al2vgfnArt9qi2bTpPeRHouCO7qngUZiQsIM1E2x91u22z65J"
) )
else: else:
if not use_jp_machine:
outer_backend_url = "http://218.77.58.8:48080" outer_backend_url = "http://218.77.58.8:48080"
base_fastgpt_url = "http://192.168.252.71:18088" base_fastgpt_url = "http://192.168.252.71:18088"
base_backend_url = "http://192.168.252.71:48080" base_backend_url = "http://192.168.252.71:48080"
else:
outer_backend_url = "http://172.21.107.45:48080"
base_fastgpt_url = "http://172.21.107.45:38080"
base_backend_url = "http://172.21.107.45:48080"
segment_review_api_key = ( segment_review_api_key = (
"fastgpt-vLu2JHAfqwEq5FUQhvATFDK0yDS6fs804v7KwWBMyU4sRrHzh4UGl89Zpa" "fastgpt-vLu2JHAfqwEq5FUQhvATFDK0yDS6fs804v7KwWBMyU4sRrHzh4UGl89Zpa"
) )
...@@ -55,11 +82,19 @@ if use_docker: ...@@ -55,11 +82,19 @@ if use_docker:
root_path = "/app" root_path = "/app"
LLM = { LLM = {
"base_tool_llm": LLMConfig(), "base_tool_llm": LLMConfig(),
"fastgpt_segment_review": LLMConfig( "fastgpt_segment_review": (
LLMConfig()
if use_non_fastgpt_llm
else LLMConfig(
base_url=f"{base_fastgpt_url}/api/v1", api_key=segment_review_api_key base_url=f"{base_fastgpt_url}/api/v1", api_key=segment_review_api_key
)
), ),
"fastgpt_reflect_retry": LLMConfig( "fastgpt_reflect_retry": (
LLMConfig()
if use_non_fastgpt_llm
else LLMConfig(
base_url=f"{base_fastgpt_url}/api/v1", api_key=reflect_retry_api_key base_url=f"{base_fastgpt_url}/api/v1", api_key=reflect_retry_api_key
)
), ),
} }
doc_support_formats = [".docx", ".doc", ".wps"] doc_support_formats = [".docx", ".doc", ".wps"]
......
...@@ -11,13 +11,12 @@ from uuid import uuid4 ...@@ -11,13 +11,12 @@ from uuid import uuid4
from utils.http_util import upload_file from utils.http_util import upload_file
from utils.doc_util import DocBase from utils.doc_util import DocBase
from core.config import META_KEY from core.config import META_KEY, FILE_SUFFIX, use_lufa
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
_ALLOWED_RISK_LEVELS = {"H", "M", "L", ""} _ALLOWED_RISK_LEVELS = {"H", "M", "L", "H,M", ""}
FINDING_KEY_REVIEW = "review" FINDING_KEY_REVIEW = "review"
FINDING_KEY_REFLECT = "reflect" FINDING_KEY_REFLECT = "reflect"
FINDING_KEY_MERGE = "merge" FINDING_KEY_MERGE = "merge"
...@@ -290,9 +289,10 @@ class MemoryStore: ...@@ -290,9 +289,10 @@ class MemoryStore:
raise ImportError( raise ImportError(
"openpyxl is required for export_to_excel; install via 'pip install openpyxl'" "openpyxl is required for export_to_excel; install via 'pip install openpyxl'"
) from exc ) from exc
file_suffix = FILE_SUFFIX + datetime.now().strftime("%Y%m%d_%H%M%S")
ts = datetime.now().strftime("%Y%m%d_%H%M%S") name = file_name or f"memory_export.xlsx"
name = file_name or f"memory_export_{ts}.xlsx" name = Path(name).stem + file_suffix + ".xlsx"
# print(f"Exporting to Excel with file name: {name}")
output_path = Path(__file__).resolve().parent.parent / "tmp" / name output_path = Path(__file__).resolve().parent.parent / "tmp" / name
with self._lock: with self._lock:
...@@ -381,13 +381,11 @@ class MemoryStore: ...@@ -381,13 +381,11 @@ class MemoryStore:
"""Add all findings as comments to a document, upload, then delete the local file.""" """Add all findings as comments to a document, upload, then delete the local file."""
if doc_obj is None: if doc_obj is None:
raise ValueError("doc_obj is required") raise ValueError("doc_obj is required")
# build suffix
ts = datetime.now().strftime("%Y%m%d_%H%M%S") file_suffix = FILE_SUFFIX + datetime.now().strftime("%Y%m%d_%H%M%S")
doc_name = getattr(doc_obj, "_doc_name", "") or "" # derive file name
suffix = Path(doc_name).suffix or ".docx" name = file_name or getattr(doc_obj, "_doc_name", "") or "memory_export.docx"
name = file_name or f"findings_{ts}{suffix}" name = Path(name).stem + file_suffix + (Path(name).suffix or ".docx")
if not Path(name).suffix:
name = f"{name}{suffix}"
output_path = Path(__file__).resolve().parent.parent / "tmp" / name output_path = Path(__file__).resolve().parent.parent / "tmp" / name
target_key = self._normalize_finding_key(finding_key) target_key = self._normalize_finding_key(finding_key)
...@@ -398,6 +396,7 @@ class MemoryStore: ...@@ -398,6 +396,7 @@ class MemoryStore:
for idx, f in enumerate(target_findings, start=1): for idx, f in enumerate(target_findings, start=1):
segment_id = int(f.segment_id or 0) segment_id = int(f.segment_id or 0)
chunk_id = max(segment_id, 0) chunk_id = max(segment_id, 0)
if use_lufa:
suggest_parts = [] suggest_parts = []
if f.risk_level: if f.risk_level:
suggest_parts.append(f"风险等级:{f.risk_level}") suggest_parts.append(f"风险等级:{f.risk_level}")
...@@ -406,6 +405,8 @@ class MemoryStore: ...@@ -406,6 +405,8 @@ class MemoryStore:
if f.suggestion: if f.suggestion:
suggest_parts.append(f"建议:{f.suggestion}") suggest_parts.append(f"建议:{f.suggestion}")
suggest_text = "\n".join(suggest_parts).strip() suggest_text = "\n".join(suggest_parts).strip()
else:
suggest_text = f"建议:{f.suggestion}".strip()
comments.append( comments.append(
{ {
"id": str(idx), "id": str(idx),
...@@ -517,9 +518,11 @@ def test_memory_and_export_excel(): ...@@ -517,9 +518,11 @@ def test_memory_and_export_excel():
# print("Findings search:") # print("Findings search:")
# for f in hits: # for f in hits:
# print(json.dumps(asdict(f), ensure_ascii=False, indent=2)) # print(json.dumps(asdict(f), ensure_ascii=False, indent=2))
print(store.export_to_excel()) print(store.export_to_excel("测试"))
if __name__ == "__main__": if __name__ == "__main__":
# test_export_findings_to_doc_comments("/home/ccran/lufa-contract/tmp/股份转让协议.docx") test_export_findings_to_doc_comments(
test_memory_and_export_excel() "/home/ccran/lufa-contract/tmp/1_金盘箱变采购合同.docx"
)
# test_memory_and_export_excel()
from __future__ import annotations
from typing import Any, Dict, List
from core.tool import tool, tool_func
@tool("rule_filter", "规则过滤")
class RuleFilterTool:
    """Base class for rule-filter tools.

    It only declares the ``run`` entry point and its JSON schema; the
    implementation here always raises, so concrete filters override ``run``.
    """

    @tool_func(
        {
            "type": "object",
            "properties": {
                "payload": {"type": "object"},
            },
            "required": ["payload"],
        }
    )
    def run(self, payload: Dict[str, Any]) -> Dict[str, Any]:
        """Filter the rules carried by ``payload``.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError("Subclasses must implement run")
@tool("lufa_party_rule_filter_tool", "LUFA 当事人与支付主体规则过滤")
class LufaPartyRuleFilterTool(RuleFilterTool):
    """Rule filter for the LUFA party / payment-subject rules.

    The filtering logic is currently disabled, so ``run`` hands the payload
    back untouched. The disabled implementation is kept below for reference.
    """

    @tool_func(
        {
            "type": "object",
            "properties": {
                "payload": {"type": "object"},
            },
            "required": ["payload"],
        }
    )
    def run(self, payload: Dict[str, Any]) -> Dict[str, Any]:
        """Return ``payload`` unchanged (pass-through while filtering is off)."""
        # Disabled implementation, retained for reference:
        #
        # rules = payload.get("rules") or []
        # segment_idx = int(payload.get("segment_idx", 0))
        # total_segments = int(payload.get("total_segments", 0))
        # if not rules or total_segments <= 0:
        #     payload["rules"] = rules
        #     return payload
        # # With an odd number of segments, the middle segment counts as
        # # part of the first half.
        # first_half_count = (total_segments + 1) // 2
        # if segment_idx < first_half_count:
        #     filtered_rules: List[Dict[str, Any]] = [
        #         r for r in rules if "支付主体审查" not in str(r.get("title", ""))
        #     ]
        # else:
        #     filtered_rules = [
        #         r for r in rules if "当事人审查" not in str(r.get("title", ""))
        #     ]
        # payload["rules"] = filtered_rules
        return payload
from __future__ import annotations from __future__ import annotations
import difflib
import json import json
import re import re
import unicodedata import unicodedata
...@@ -11,7 +10,6 @@ from core.tools.segment_llm import LLMTool ...@@ -11,7 +10,6 @@ from core.tools.segment_llm import LLMTool
from loguru import logger from loguru import logger
import traceback import traceback
MERGER_SYSTEM_PROMPT = """ MERGER_SYSTEM_PROMPT = """
你将收到同一组 findings 的 issue 与 suggestion 列表,请做信息融合而非机械拼接。 你将收到同一组 findings 的 issue 与 suggestion 列表,请做信息融合而非机械拼接。
...@@ -107,17 +105,17 @@ def _merge_text_union(base: str, other: str) -> str: ...@@ -107,17 +105,17 @@ def _merge_text_union(base: str, other: str) -> str:
return f"{left}\n{right}" return f"{left}\n{right}"
def _has_substring_overlap(a: str, b: str, min_common_len: int = 8) -> bool: def _has_substring_overlap(a: str, b: str, min_common_len: int = 5) -> bool:
left = _normalize_text_for_match(str(a or "")) left = _normalize_text_for_match(str(a or ""))
right = _normalize_text_for_match(str(b or "")) right = _normalize_text_for_match(str(b or ""))
if not left or not right: if not left or not right:
return False return False
if left in right or right in left: # Only treat edge overlap as related: suffix(left)->prefix(right) or reverse.
return True overlap_len = max(
match = difflib.SequenceMatcher(None, left, right).find_longest_match( _max_suffix_prefix_overlap(left, right),
0, len(left), 0, len(right) _max_suffix_prefix_overlap(right, left),
) )
return match.size >= min_common_len return overlap_len >= min_common_len
def _normalize_text_for_match(text: str) -> str: def _normalize_text_for_match(text: str) -> str:
...@@ -225,7 +223,13 @@ def _rule_based_merge( ...@@ -225,7 +223,13 @@ def _rule_based_merge(
groups.append([findings[idx] for idx in group_idx]) groups.append([findings[idx] for idx in group_idx])
return [_merge_group(group, field_merger=field_merger) for group in groups] merged_findings: List[Dict[str, Any]] = []
for group in groups:
if _should_skip_group_merge(group):
merged_findings.extend([_normalize_finding(item) for item in group])
continue
merged_findings.append(_merge_group(group, field_merger=field_merger))
return merged_findings
def _deterministic_field_merge(group: List[Dict[str, Any]]) -> Dict[str, str]: def _deterministic_field_merge(group: List[Dict[str, Any]]) -> Dict[str, str]:
...@@ -237,6 +241,15 @@ def _deterministic_field_merge(group: List[Dict[str, Any]]) -> Dict[str, str]: ...@@ -237,6 +241,15 @@ def _deterministic_field_merge(group: List[Dict[str, Any]]) -> Dict[str, str]:
} }
def _should_skip_group_merge(group: List[Dict[str, Any]]) -> bool:
if len(group) <= 1:
return True
for item in group:
if not str(item.get("original_text", "") or "").strip():
return True
return False
@tool("segment_merger", "同证据 findings 合并") @tool("segment_merger", "同证据 findings 合并")
class SegmentMergerTool(LLMTool): class SegmentMergerTool(LLMTool):
def __init__(self) -> None: def __init__(self) -> None:
...@@ -321,30 +334,30 @@ if __name__ == "__main__": ...@@ -321,30 +334,30 @@ if __name__ == "__main__":
tool = SegmentMergerTool() tool = SegmentMergerTool()
sample = [ sample = [
{ {
"rule_title": "支付时间审查", "rule_title": "预付款审查",
"segment_id": 0, "segment_id": 0,
"original_text": "本协议约定的服务内容全部履行完毕经甲方认可,在乙方提交检测数据后15个工作日内,乙方须向甲方提供相应数额的正规发票,甲方一次性支付合同总金额的100%给乙方。", "original_text": "丙方提交的下列单据经甲方、乙方审核无误后 20 个工作日内,支付该批次设备合同价格 70% 的到款项,合同生效后,丙方提供下列材料,甲方、乙方审核无误后20个工作日内支付给丙方本合同总价的10%的款项(¥458,000元,人民币大写肆拾伍万捌仟元整)作为预付款",
"issue": "付款条件缺乏实质把控。条款将付款绑定于提交数据,未明确'经甲方认可'的验收标准、期限及异议机制,且未设置质保金,存在验收流于形式即需全额付款的风险。", "issue": "根据审查规则,预付款比例应大于等于合同总价款的20%,或约定发货前付清全款。合同第3.2.1条约定预付款比例为合同总价的10%(458,000元),未达到公司规定的20%最低比例要求,且未约定发货前付清全款,存在资金占用风险。",
"risk_level": "H", "risk_level": "H",
"suggestion": "修改为:'乙方提交报告后,甲方在X个工作日内验收。验收合格且收到发票后15个工作日内支付95%;剩余5%作为质保金,满X个月无异议后无息支付。若验收不合格,甲方有权拒付并要求整改。'", "suggestion": "将预付款比例修改为合同总价的20%。建议修改为:“合同生效后,丙方提供下列材料,甲方、乙方审核无误后20个工作日内支付给丙方本合同总价的20%的款项(¥916,000元,人民币大写玖拾壹万陆仟元整)作为预付款”。",
"result": "不合格", "result": "不合格",
}, },
{ {
"rule_title": "发票审查", "rule_title": "付款时间审查",
"segment_id": 0, "segment_id": 0,
"original_text": "在乙方提交检测数据后15个工作日内,乙方须向甲方提供相应数额的正规发票,甲方一次性支付合同总金额的100%给乙方。甲方指定由其全资子公司长沙高新控股集团有限公司(简称高新控股)承担并支付本合同约定的检查服务费,乙方向高新控股开具相应金额的增值税专用发票。", "original_text": "丙方按合同约定和交货通知单的要求交付合同设备后,现场经清点无误并验收合格,丙方提交的下列单据经甲方、乙方审核无误后 20 个工作日内,支付该批次设备合同价格 70% 的到款项",
"issue": "缺失发票税率约定。条款明确了发票类型和开具时间,但未约定适用税率,违反审查规则,可能导致后续开票金额争议或税务合规风险。", "issue": "根据审查规则,发货后的付款项(如到货款)若未约定发货前全额付款,必须以“到货 XX 天/月”或相似表述作为充分条件之一,且若有多个条件需提及“先到为准”。当前条款约定支付“到货款”的条件仅为“现场经清点无误并验收合格”,属于以验收结果作为唯一触发条件,未设置“到货 XX 天”的闭口时间限制。若买方拖延验收,将导致卖方收款时间无限期延后,不符合规则要求。",
"risk_level": "H", "risk_level": "H",
"suggestion": "补充税率约定。建议在'乙方向高新控股开具相应金额的增值税专用发票'后补充:'(税率:6%)'或根据实际业务类型补充具体税率数值。", "suggestion": "修改为:丙方按合同约定和交货通知单的要求交付合同设备后,现场经清点无误并验收合格,或自货物到达现场之日起【30】日内(以先到者为准),丙方提交的下列单据经甲方、乙方审核无误后 20 个工作日内,支付该批次设备合同价格 70% 的到款项。",
"result": "不合格", "result": "不合格",
}, },
{ {
"rule_title": "主体审查", "rule_title": "主体审查",
"segment_id": 0, "segment_id": 0,
"original_text": "委托方(甲方): 湖南麓谷发展集团有限公司... 甲方指定由其全资子公司长沙高新控股集团有限公司(简称高新控股)承担并支付本合同约定的检查服务费... 签章处:甲方:长沙高新控股集团有限公司", "original_text": "3.2.2 ...支付该批次设备合同价格70%的到款项; ... B.乙方开具给甲方、丙方开具给乙方的金额为该批次合同价格100%的增值税专用发票;",
"issue": "签约主体不一致。首部甲方为'湖南麓谷发展集团有限公司',但签章处及付款义务主体变更为'长沙高新控股集团有限公司',且未明确授权委托或变更确认条款,存在主体混同及履约风险。", "issue": "第3.2.2条约定到货款支付比例为70%,但条款要求开具金额为该批次合同价格100%的增值税专用发票。此时发票比例(100%)高于付款比例(70%)。根据审查规则,此类情况仅在合同中明确表述“货到”或“发货完成”时才为合格。虽然3.2.2条提到“交付合同设备后...验收合格”,但开票义务在3.3.1条中表述为无条件义务,未将“货到/发货完成”作为开具全额发票的强制前置条件,导致卖方在仅收到70%款项时需开具100%发票,不符合卖方利益。",
"risk_level": "H", "risk_level": "H",
"suggestion": "统一合同主体名称。若确由子公司履约,应将首部及正文甲方统一修改为'长沙高新控股集团有限公司';若由母公司签约,应在签章处由母公司盖章,并补充'指定子公司代为履行付款义务'条款。", "suggestion": '明确开票节点与付款节点的对应关系,修改为:"丙方应在合同设备发货后、买方支付到货款前,向乙方开具该批次设备金额100%的增值税专用发票。"',
"result": "不合格", "result": "不合格",
}, },
] ]
......
...@@ -14,6 +14,7 @@ from utils.common_util import random_str ...@@ -14,6 +14,7 @@ from utils.common_util import random_str
from utils.http_util import upload_file, fastgpt_openai_chat, download_file from utils.http_util import upload_file, fastgpt_openai_chat, download_file
use_lufa = False use_lufa = False
batch_size = 5
if not use_lufa: if not use_lufa:
SUFFIX = "_麓发迁移" SUFFIX = "_麓发迁移"
...@@ -26,13 +27,13 @@ if not use_lufa: ...@@ -26,13 +27,13 @@ if not use_lufa:
# 人机交互测试(测试环境) # 人机交互测试(测试环境)
# token = 'fastgpt-p189K5zoTX5wjp0dBybFCwsbWm3juIwlJxt2wTGyiaOWOANI5Y10pKEZzyt' # token = 'fastgpt-p189K5zoTX5wjp0dBybFCwsbWm3juIwlJxt2wTGyiaOWOANI5Y10pKEZzyt'
# 人机交互测试(生产环境) # 人机交互测试(生产环境)
# token = 'fastgpt-ry4jIjgNwmNgufMr5jR0ncvJVmSS4GZl4bx2ItsNPoncdQzW9Na3IP1Xrankr' # token = "fastgpt-ry4jIjgNwmNgufMr5jR0ncvJVmSS4GZl4bx2ItsNPoncdQzW9Na3IP1Xrankr"
# 提取后审查测试 # 提取后审查测试
# token = 'fastgpt-n74gGX5ZqLT6o1ysMBSGUTjIciswYOWDRfQ75krMkE5gDVDkpzsbz8u' # token = 'fastgpt-n74gGX5ZqLT6o1ysMBSGUTjIciswYOWDRfQ75krMkE5gDVDkpzsbz8u'
else: else:
SUFFIX = "_麓发" SUFFIX = "_麓发"
batch_input_dir_path = "lufa-input" batch_input_dir_path = "4.24测财务合同审核"
batch_output_dir_path = "lufa-output-standard" batch_output_dir_path = "4.24测财务合同审核-batch"
# 麓发fastgpt接口 # 麓发fastgpt接口
url = "http://192.168.252.71:18089/api/v1/chat/completions" url = "http://192.168.252.71:18089/api/v1/chat/completions"
# 麓发合同审查生产token # 麓发合同审查生产token
...@@ -41,9 +42,6 @@ else: ...@@ -41,9 +42,6 @@ else:
token = "fastgpt-mg5tQUgreJeF7peoOr5zqP0NR4EIrfS2bEVXge6FUL94Suu1TvEMR1sGNRSiV" token = "fastgpt-mg5tQUgreJeF7peoOr5zqP0NR4EIrfS2bEVXge6FUL94Suu1TvEMR1sGNRSiV"
batch_size = 5
def extract_url(text): def extract_url(text):
# \s * ([ ^ "\s]+?\.(?:docx?|pdf|xlsx)) # \s * ([ ^ "\s]+?\.(?:docx?|pdf|xlsx))
excel_p, doc_p = ( excel_p, doc_p = (
......
...@@ -328,7 +328,7 @@ def _parse_args() -> argparse.Namespace: ...@@ -328,7 +328,7 @@ def _parse_args() -> argparse.Namespace:
parser.add_argument( parser.add_argument(
"--val-dir", "--val-dir",
type=Path, type=Path,
default=base / "batch_output_0121_val", default=base / "jp-output-rj-base",
help="Directory containing extracted val xlsx files.", help="Directory containing extracted val xlsx files.",
) )
parser.add_argument( parser.add_argument(
......
...@@ -10,11 +10,11 @@ from spire.doc import Document ...@@ -10,11 +10,11 @@ from spire.doc import Document
from compare_annotation import compare_with_log from compare_annotation import compare_with_log
# Map raw comment authors to unified review item names. # Map raw comment authors to unified review item names.
COMMENT_AUTHOR_MAPPING: dict[str, str] = { COMMENT_AUTHOR_MAPPING: dict[str, str] = {
"三方货款审查": "第三方审查", "三方货款审查": "第三方审查",
"履行义务审查": "第三方审查", "履行义务审查": "第三方审查",
"债务转移审查": "第三方审查",
"违约条款审查": "违约与延期审查", "违约条款审查": "违约与延期审查",
"延期审查": "违约与延期审查", "延期审查": "违约与延期审查",
} }
...@@ -121,7 +121,7 @@ def _parse_args() -> argparse.Namespace: ...@@ -121,7 +121,7 @@ def _parse_args() -> argparse.Namespace:
parser.add_argument( parser.add_argument(
"--datasets-dir", "--datasets-dir",
type=Path, type=Path,
default=base / "results" / "jp-output-lufa-20260416-235546", default=base / "results" / "jp-output-lufa-20260511-101828",
help="Directory containing Word files with annotations.", help="Directory containing Word files with annotations.",
) )
parser.add_argument( parser.add_argument(
......
No preview for this file type
from spire.doc import *
from spire.doc.common import *
# Paths of the annotated input document and the comment-free copy to write.
annotated_path = "/home/ccran/lufa-contract/demo/湖南麓谷发展集团“主数据管理系统与合同管理系统开发”项目合同协议书-审核批注20260511_153215.docx"
clean_path = "/home/ccran/lufa-contract/demo/湖南麓谷发展集团“主数据管理系统与合同管理系统开发”项目合同协议书-审核批注20260511_153215-无批注.docx"
# Create a Document object and load the Word file into it.
doc = Document()
doc.LoadFromFile(annotated_path)
# To remove only the second comment instead:
# doc.Comments.RemoveAt(1)
# Remove every comment from the document.
doc.Comments.Clear()
# Save the result to the new path, then close the document.
doc.SaveToFile(clean_path)
doc.Close()
...@@ -14,10 +14,16 @@ from loguru import logger ...@@ -14,10 +14,16 @@ from loguru import logger
from utils.common_util import extract_url_file, format_now from utils.common_util import extract_url_file, format_now
from utils.http_util import download_file from utils.http_util import download_file
from core.cache import get_cached_doc_tool, get_cached_memory from core.cache import get_cached_doc_tool, get_cached_memory
from core.config import doc_support_formats, pdf_support_formats, MERGE_RULE_PROMPT from core.config import (
doc_support_formats,
pdf_support_formats,
MERGE_RULE_PROMPT,
use_lufa,
)
from core.tools.segment_summary import SegmentSummaryTool from core.tools.segment_summary import SegmentSummaryTool
from core.tools.segment_review import SegmentReviewTool from core.tools.segment_review import SegmentReviewTool
from core.tools.segment_rule_router import SegmentRuleRouterTool from core.tools.segment_rule_router import SegmentRuleRouterTool
from core.tools.rule_filter import LufaPartyRuleFilterTool
from core.tools.retrieve_reference import RetrieveReferenceTool from core.tools.retrieve_reference import RetrieveReferenceTool
from core.tools.reflect_retry import ReflectRetryTool from core.tools.reflect_retry import ReflectRetryTool
from core.tools.segment_merger import SegmentMergerTool from core.tools.segment_merger import SegmentMergerTool
...@@ -30,6 +36,7 @@ TMP_DIR.mkdir(parents=True, exist_ok=True) ...@@ -30,6 +36,7 @@ TMP_DIR.mkdir(parents=True, exist_ok=True)
summary_tool = SegmentSummaryTool() summary_tool = SegmentSummaryTool()
review_tool = SegmentReviewTool() review_tool = SegmentReviewTool()
rule_router_tool = SegmentRuleRouterTool() rule_router_tool = SegmentRuleRouterTool()
lufa_party_rule_filter_tool = LufaPartyRuleFilterTool()
reference_tool = RetrieveReferenceTool() reference_tool = RetrieveReferenceTool()
reflect_tool = ReflectRetryTool() reflect_tool = ReflectRetryTool()
merger_tool = SegmentMergerTool() merger_tool = SegmentMergerTool()
...@@ -59,6 +66,7 @@ class DocumentParseResponse(BaseModel): ...@@ -59,6 +66,7 @@ class DocumentParseResponse(BaseModel):
ruleset_items: List[str] ruleset_items: List[str]
text: Optional[str] = None text: Optional[str] = None
file_ext: Optional[str] = None file_ext: Optional[str] = None
file_name: Optional[str] = None
@app.post("/documents/parse", response_model=DocumentParseResponse) @app.post("/documents/parse", response_model=DocumentParseResponse)
...@@ -66,18 +74,13 @@ async def parse_document(payload: DocumentParseRequest) -> DocumentParseResponse ...@@ -66,18 +74,13 @@ async def parse_document(payload: DocumentParseRequest) -> DocumentParseResponse
if not payload.urls: if not payload.urls:
raise HTTPException(status_code=400, detail="No URLs provided") raise HTTPException(status_code=400, detail="No URLs provided")
try: try:
support_formats = list(dict.fromkeys(doc_support_formats + pdf_support_formats)) file_path = download_file(payload.urls[0], TMP_DIR)
filename = extract_url_file(payload.urls[0], support_formats) if not file_path:
except Exception as exc: raise RuntimeError("download returned empty path")
raise HTTPException(status_code=400, detail=f"Failed to parse url: {exc}")
file_path = str(TMP_DIR / filename)
try:
download_file(payload.urls[0], file_path)
except Exception as exc: except Exception as exc:
raise HTTPException(status_code=500, detail=f"Download failed: {exc}") raise HTTPException(status_code=500, detail=f"Download failed: {exc}")
# get doc tool # get doc tool
file_ext = payload.file_ext or Path(filename).suffix file_ext = payload.file_ext or Path(file_path).suffix
try: try:
doc_obj, _ = get_cached_doc_tool(payload.conversation_id, file_ext) doc_obj, _ = get_cached_doc_tool(payload.conversation_id, file_ext)
except Exception as exc: except Exception as exc:
...@@ -105,6 +108,7 @@ async def parse_document(payload: DocumentParseRequest) -> DocumentParseResponse ...@@ -105,6 +108,7 @@ async def parse_document(payload: DocumentParseRequest) -> DocumentParseResponse
segment_ids=segment_ids, segment_ids=segment_ids,
ruleset_items=ruleset_review_items, ruleset_items=ruleset_review_items,
file_ext=file_ext, file_ext=file_ext,
file_name=Path(file_path).name,
) )
...@@ -285,6 +289,22 @@ def route_segment_rules(payload: SegmentReviewRequest) -> SegmentRuleRouterRespo ...@@ -285,6 +289,22 @@ def route_segment_rules(payload: SegmentReviewRequest) -> SegmentRuleRouterRespo
ruleset_id = payload.ruleset_id or reference_tool.default_ruleset_id ruleset_id = payload.ruleset_id or reference_tool.default_ruleset_id
rules = reference_tool.run(ruleset_id=ruleset_id).get("rules", []) rules = reference_tool.run(ruleset_id=ruleset_id).get("rules", [])
if use_lufa and rules:
try:
total_segments = len(doc_obj.get_chunk_id_list() or [])
except Exception:
total_segments = 0
filtered_payload = lufa_party_rule_filter_tool.run(
{
"rules": rules,
"segment_idx": segment_idx,
"total_segments": total_segments,
}
)
rules = filtered_payload.get("rules", rules)
result = rule_router_tool.run( result = rule_router_tool.run(
segment_id=segment_idx, segment_id=segment_idx,
segment_text=segment_text, segment_text=segment_text,
...@@ -508,7 +528,9 @@ def export_memory(payload: MemoryExportRequest) -> MemoryExportResponse: ...@@ -508,7 +528,9 @@ def export_memory(payload: MemoryExportRequest) -> MemoryExportResponse:
try: try:
doc_res = store.export_findings_to_doc_comments( doc_res = store.export_findings_to_doc_comments(
doc_obj, finding_key=payload.finding_key or FINDING_KEY_REVIEW doc_obj,
file_name=payload.file_name,
finding_key=payload.finding_key or FINDING_KEY_REVIEW,
) )
except Exception as exc: except Exception as exc:
traceback.print_exc() traceback.print_exc()
......
from main import FactsRetrieveRequest, retrieve_facts
from core.cache import get_cached_memory
import json
def test_retrieve_facts_direct() -> None:
    """Call ``retrieve_facts`` directly for a fixed conversation and print the facts.

    The returned ``facts`` are dumped as indented JSON with non-ASCII
    characters left unescaped.
    """
    request = FactsRetrieveRequest(
        conversation_id="fa86563cb6c649d59e32e7def16ea6b2",
        keywords=["当事人"],
    )
    response = retrieve_facts(request)
    facts_json = json.dumps(response.facts, ensure_ascii=False, indent=4)
    print(facts_json)
if __name__ == "__main__":
test_retrieve_facts_direct()
...@@ -9,7 +9,7 @@ from core.config import max_chunk_page, min_single_chunk_size, max_single_chunk_ ...@@ -9,7 +9,7 @@ from core.config import max_chunk_page, min_single_chunk_size, max_single_chunk_
def random_str(l=5): def random_str(l=5):
return ''.join(random.sample('abcdefghijklmnopqrstuvwxyz', l)) return "".join(random.sample("abcdefghijklmnopqrstuvwxyz", l))
def format_now(): def format_now():
...@@ -19,12 +19,14 @@ def format_now(): ...@@ -19,12 +19,14 @@ def format_now():
# 从url中提取文件名称 # 从url中提取文件名称
def extract_url_file(url, support_formats): def extract_url_file(url, support_formats):
pattern = '|'.join([r'[\u4e00-\u9fa5()()0-9\w-]+' + format for format in support_formats]) pattern = "|".join(
[r"[\u4e00-\u9fa5()()0-9\w-]+" + format for format in support_formats]
)
search_result = re.search(pattern, url) search_result = re.search(pattern, url)
if search_result: if search_result:
return search_result.group() return search_result.group()
else: else:
raise Exception(f'{support_formats} not found in url:{url}') raise Exception(f"{support_formats} not found in url:{url}")
# 调整单个页面数量 # 调整单个页面数量
...@@ -34,7 +36,7 @@ def adjust_single_chunk_size(all_text_len): ...@@ -34,7 +36,7 @@ def adjust_single_chunk_size(all_text_len):
# 从JSON字符串提取JSON对象 # 从JSON字符串提取JSON对象
def extract_json(json_str:str) -> List[Dict]: def extract_json(json_str: str) -> List[Dict]:
"""从字符串中提取 JSON 对象列表。 """从字符串中提取 JSON 对象列表。
优先提取 ```json ... ``` 代码块;若不存在,尝试: 优先提取 ```json ... ``` 代码块;若不存在,尝试:
...@@ -43,12 +45,13 @@ def extract_json(json_str:str) -> List[Dict]: ...@@ -43,12 +45,13 @@ def extract_json(json_str:str) -> List[Dict]:
- 从任意包含花括号/方括号的片段尝试解析 - 从任意包含花括号/方括号的片段尝试解析
返回解析成功的 JSON 对象列表(数组会被展开)。 返回解析成功的 JSON 对象列表(数组会被展开)。
""" """
def _try_parse_to_list(candidate: str, out_list: list) -> bool: def _try_parse_to_list(candidate: str, out_list: list) -> bool:
s = (candidate or '').strip() s = (candidate or "").strip()
if not s: if not s:
return False return False
# 清理控制字符 # 清理控制字符
s = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', s) s = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]", "", s)
try: try:
obj = json_repair.loads(s, strict=False) obj = json_repair.loads(s, strict=False)
if isinstance(obj, list): if isinstance(obj, list):
...@@ -63,26 +66,26 @@ def extract_json(json_str:str) -> List[Dict]: ...@@ -63,26 +66,26 @@ def extract_json(json_str:str) -> List[Dict]:
results = [] results = []
# 1. 提取 ```json ... ``` 代码块 # 1. 提取 ```json ... ``` 代码块
fenced_json_pattern = r'```json([\s\S]*?)```' fenced_json_pattern = r"```json([\s\S]*?)```"
for match in re.findall(fenced_json_pattern, json_str or '', re.DOTALL): for match in re.findall(fenced_json_pattern, json_str or "", re.DOTALL):
_try_parse_to_list(match, results) _try_parse_to_list(match, results)
if results: if results:
return results return results
# 2. 尝试将全文解析为 JSON # 2. 尝试将全文解析为 JSON
if _try_parse_to_list(json_str or '', results): if _try_parse_to_list(json_str or "", results):
return results return results
# 3. 提取普通 ``` ... ``` 代码块,尝试解析 # 3. 提取普通 ``` ... ``` 代码块,尝试解析
fenced_any_pattern = r'```([\s\S]*?)```' fenced_any_pattern = r"```([\s\S]*?)```"
for match in re.findall(fenced_any_pattern, json_str or '', re.DOTALL): for match in re.findall(fenced_any_pattern, json_str or "", re.DOTALL):
if _try_parse_to_list(match, results): if _try_parse_to_list(match, results):
return results return results
# 4. 从包含花括号/方括号的片段尝试解析(启发式,尽力而为) # 4. 从包含花括号/方括号的片段尝试解析(启发式,尽力而为)
bracket_pattern = r'(\{[\s\S]*?\}|\[[\s\S]*?\])' bracket_pattern = r"(\{[\s\S]*?\}|\[[\s\S]*?\])"
for match in re.findall(bracket_pattern, json_str or '', re.DOTALL): for match in re.findall(bracket_pattern, json_str or "", re.DOTALL):
_try_parse_to_list(match, results) _try_parse_to_list(match, results)
return results return results
...@@ -90,11 +93,7 @@ def extract_json(json_str:str) -> List[Dict]: ...@@ -90,11 +93,7 @@ def extract_json(json_str:str) -> List[Dict]:
def remove_duplicates_by_key(data_list, key): def remove_duplicates_by_key(data_list, key):
# 先按字符串长度从长到短排序 # 先按字符串长度从长到短排序
sorted_list = sorted( sorted_list = sorted(data_list, key=lambda x: len(x.get(key, "")), reverse=True)
data_list,
key=lambda x: len(x.get(key, "")),
reverse=True
)
result = [] result = []
seen_strings = [] seen_strings = []
...@@ -109,12 +108,14 @@ def remove_duplicates_by_key(data_list, key): ...@@ -109,12 +108,14 @@ def remove_duplicates_by_key(data_list, key):
def extract_drop_json_part(json_str): def extract_drop_json_part(json_str):
json_pattern = r'```json([\s\S]*?)```' json_pattern = r"```json([\s\S]*?)```"
non_json_content = re.sub(json_pattern, '', json_str, re.DOTALL) non_json_content = re.sub(json_pattern, "", json_str, re.DOTALL)
return non_json_content.strip() return non_json_content.strip()
def group_chunk_by_len(chunk_list: List[Dict], key: str, chunk_len: int) -> List[List[Dict]]: def group_chunk_by_len(
chunk_list: List[Dict], key: str, chunk_len: int
) -> List[List[Dict]]:
ret_chunk_list = [] ret_chunk_list = []
sub_chunk_list = [] sub_chunk_list = []
current_acc_len = 0 # 用于记录当前 sub_chunk 的累积长度 current_acc_len = 0 # 用于记录当前 sub_chunk 的累积长度
...@@ -144,7 +145,10 @@ def group_chunk_by_len(chunk_list: List[Dict], key: str, chunk_len: int) -> List ...@@ -144,7 +145,10 @@ def group_chunk_by_len(chunk_list: List[Dict], key: str, chunk_len: int) -> List
return ret_chunk_list return ret_chunk_list
if __name__ == '__main__': if __name__ == "__main__":
json_str = '```json{"segment_id": "seg-001"}```' json_str = '```json{"segment_id": "seg-001"}```'
print(extract_json(json_str)) print(extract_json(json_str))
url = "/api/common/file/read/今麦郎合同审核.docx?token=eyJhbGciOiJ.kpXVCJ9.1xfdsa"
print(extract_url_file(url, [".docx", ".doc", ".wps"]))
pass pass
import json import json
import os import os
import re
from pathlib import Path
from urllib.parse import unquote, urlparse
import requests import requests
from loguru import logger from loguru import logger
...@@ -82,6 +85,26 @@ def upload_file(path, input_url_to_inner=True, output_url_to_inner=False) -> str ...@@ -82,6 +85,26 @@ def upload_file(path, input_url_to_inner=True, output_url_to_inner=False) -> str
raise Exception(f"上传{path}失败 Response text: {response.text}") raise Exception(f"上传{path}失败 Response text: {response.text}")
def _resolve_download_filename(url: str, response: requests.Response) -> str:
content_disposition = response.headers.get("content-disposition", "")
if content_disposition:
match = re.search(
r"filename\*=(?:UTF-8''|utf-8'')?([^;]+)", content_disposition
)
if match:
return unquote(match.group(1).strip().strip('"'))
match = re.search(r'filename="?([^";]+)"?', content_disposition)
if match:
return unquote(match.group(1).strip().strip('"'))
url_filename = Path(urlparse(url).path).name
if url_filename:
return url_filename
return "downloaded_file"
# 下载url到本地path # 下载url到本地path
def download_file(url, path, input_url_to_inner=True): def download_file(url, path, input_url_to_inner=True):
if not url.startswith("http:"): if not url.startswith("http:"):
...@@ -92,13 +115,21 @@ def download_file(url, path, input_url_to_inner=True): ...@@ -92,13 +115,21 @@ def download_file(url, path, input_url_to_inner=True):
response = requests.get(url) response = requests.get(url)
# 确保请求成功 # 确保请求成功
if response.status_code == 200: if response.status_code == 200:
target_path = Path(path)
if target_path.exists() and target_path.is_dir():
target_path = target_path / _resolve_download_filename(url, response)
target_path.parent.mkdir(parents=True, exist_ok=True)
# 打开本地文件,准备写入数据 # 打开本地文件,准备写入数据
with open(path, "wb") as f: with open(target_path, "wb") as f:
# 写入响应的内容 # 写入响应的内容
f.write(response.content) f.write(response.content)
logger.info(f"{url}文件下载成功,保存到{path}") logger.info(f"{url}文件下载成功,保存到{target_path}")
return str(target_path)
else: else:
logger.error(f"{url}文件下载失败. HTTP Status Code: {response.status_code}") logger.error(f"{url}文件下载失败. HTTP Status Code: {response.status_code}")
return None
def url_replace_fastgpt(origin: str): def url_replace_fastgpt(origin: str):
......
...@@ -4,14 +4,7 @@ from openai import AsyncOpenAI ...@@ -4,14 +4,7 @@ from openai import AsyncOpenAI
from dataclasses import dataclass from dataclasses import dataclass
from tenacity import retry, stop_after_attempt, stop_after_delay, wait_fixed from tenacity import retry, stop_after_attempt, stop_after_delay, wait_fixed
import asyncio import asyncio
from core.config import MAX_WORKERS from core.config import MAX_WORKERS, LLMConfig, use_non_fastgpt_llm
@dataclass
class LLMConfig:
base_url: str
api_key: str
model: str
class OpenAITool: class OpenAITool:
...@@ -25,10 +18,15 @@ class OpenAITool: ...@@ -25,10 +18,15 @@ class OpenAITool:
@retry(stop=stop_after_delay(600) | stop_after_attempt(3), wait=wait_fixed(1)) @retry(stop=stop_after_delay(600) | stop_after_attempt(3), wait=wait_fixed(1))
async def chat(self, msg, tools=None): async def chat(self, msg, tools=None):
if tools is None: if tools is None:
extra_body = None extra_body = {}
# fastgpt专用:如果第一个消息是system角色,则将其内容放入extra_body.variables.system,并从消息列表中移除
if not use_non_fastgpt_llm:
if msg[0]["role"] == "system": if msg[0]["role"] == "system":
extra_body = {"variables": {"system": msg[0]["content"]}} extra_body = {"variables": {"system": msg[0]["content"]}}
msg = msg[1:] msg = msg[1:]
# deepseek专用关闭思考
extra_body["thinking"] = {"type": "disabled"}
try:
response = await self.client.chat.completions.create( response = await self.client.chat.completions.create(
model=self.llm_config.model, messages=msg, extra_body=extra_body model=self.llm_config.model, messages=msg, extra_body=extra_body
) )
...@@ -36,6 +34,9 @@ class OpenAITool: ...@@ -36,6 +34,9 @@ class OpenAITool:
reasoning_content = response.choices[0].message.model_extra.get( reasoning_content = response.choices[0].message.model_extra.get(
"reasoning_content", "" "reasoning_content", ""
) )
except Exception as e:
logger.error(f"LLM调用失败: {e} | response: {response}")
raise e
return content return content
else: else:
response = await self.client.chat.completions.create( response = await self.client.chat.completions.create(
...@@ -54,3 +55,19 @@ class OpenAITool: ...@@ -54,3 +55,19 @@ class OpenAITool:
return await self.chat(m, tools) return await self.chat(m, tools)
return await asyncio.gather(*[_wrapped(m) for m in msgs]) return await asyncio.gather(*[_wrapped(m) for m in msgs])
if __name__ == "__main__":
    # Manual smoke test: send one system + user exchange through OpenAITool
    # built from a default-constructed LLMConfig and print the reply.
    llm_config = LLMConfig()
    tool = OpenAITool(llm_config)
    messages = [
        {
            "role": "system",
            "content": "你是我的人工智能助手,协助我分析问题并提供建议。",
        },
        {"role": "user", "content": "请分析以下问题:为什么天空是蓝色的?"},
    ]
    # chat() is a coroutine, so drive it with a fresh event loop.
    response = asyncio.run(tool.chat(messages))
    print("LLM Response:", response)
...@@ -4,6 +4,7 @@ import re ...@@ -4,6 +4,7 @@ import re
from thefuzz import fuzz from thefuzz import fuzz
from utils.doc_util import DocBase from utils.doc_util import DocBase
from utils.common_util import adjust_single_chunk_size from utils.common_util import adjust_single_chunk_size
from core.config import use_lufa
import os import os
...@@ -713,6 +714,40 @@ class SpireWordDoc(DocBase): ...@@ -713,6 +714,40 @@ class SpireWordDoc(DocBase):
text_range = text_sel.GetAsOneRange() text_range = text_sel.GetAsOneRange()
return self._insert_comment_by_text_range(text_range, author, comment_content) return self._insert_comment_by_text_range(text_range, author, comment_content)
def add_comment_to_first_paragraph(
    self, comment_text, author="审阅助手", target_text=None
):
    """Attach a comment to the first paragraph of the first section.

    Args:
        comment_text: Text placed in the comment body.
        author: Author name stored on the comment.
        target_text: Unused; accepted only so older call sites keep working.

    Returns:
        True when the comment was added, False when the document has no
        section or the first section has no paragraph.
    """
    self._ensure_loaded()

    document = self._doc
    if document.Sections.Count == 0:
        logger.error("文档中未找到任何节,无法添加批注")
        return False

    first_section = document.Sections.get_Item(0)
    if first_section.Paragraphs.Count == 0:
        logger.error("第一节未找到段落,无法添加批注")
        return False

    first_paragraph = first_section.Paragraphs.get_Item(0)
    children = first_paragraph.ChildObjects

    # Build the comment and append it to the paragraph.
    note = Comment(document)
    note.Body.AddParagraph().Text = comment_text
    note.Format.Author = author
    children.Add(note)

    # Start/end marks carry the comment's id; the start mark goes to the
    # front of the paragraph and the end mark to the back.
    start_mark = CommentMark(document, CommentMarkType.CommentStart)
    end_mark = CommentMark(document, CommentMarkType.CommentEnd)
    start_mark.CommentId = note.Format.CommentId
    end_mark.CommentId = note.Format.CommentId
    children.Insert(0, start_mark)
    children.Add(end_mark)
    return True
# 设置chunk批注 # 设置chunk批注
def add_table_comment( def add_table_comment(
self, table, target_text, comment_text, author="审阅助手", initials="AI" self, table, target_text, comment_text, author="审阅助手", initials="AI"
...@@ -778,6 +813,25 @@ class SpireWordDoc(DocBase): ...@@ -778,6 +813,25 @@ class SpireWordDoc(DocBase):
) )
author = self.format_comment_author(comment) author = self.format_comment_author(comment)
suggest = comment.get("suggest", "") suggest = comment.get("suggest", "")
original_text = (comment.get("original_text") or "").strip()
# original_text 为空时,直接落在文档首个可用段落。
if not original_text:
existing_comment_idx = self.find_comment(author)
if existing_comment_idx is not None:
self._update_comment_content(existing_comment_idx, suggest)
continue
first_para_author = self._decorate_author_with_match_type(
author, "exact"
)
matched = self.add_comment_to_first_paragraph(
suggest, first_para_author
)
if not matched:
logger.error("original_text 为空,且未能在首段落添加批注")
continue
find_key = comment["original_text"].strip() or comment["key_points"] find_key = comment["original_text"].strip() or comment["key_points"]
# 先检查是否已有同一“规则ID|要点”的批注,避免重复插入。 # 先检查是否已有同一“规则ID|要点”的批注,避免重复插入。
...@@ -868,6 +922,18 @@ class SpireWordDoc(DocBase): ...@@ -868,6 +922,18 @@ class SpireWordDoc(DocBase):
def to_file(self, path, remove_prefix=False): def to_file(self, path, remove_prefix=False):
self._ensure_loaded() self._ensure_loaded()
# watermark_text = (
# "Evaluation Warning: The document was created with Spire.Doc for Python."
# )
# if self._doc.Sections.Count > 0:
# section = self._doc.Sections.get_Item(0)
# if section.Paragraphs.Count > 0:
# first_paragraph = section.Paragraphs.get_Item(0)
# first_text = (first_paragraph.Text or "").strip()
# if first_text == watermark_text:
# section.Paragraphs.RemoveAt(0)
if remove_prefix: if remove_prefix:
self.remove_comment_prefix() self.remove_comment_prefix()
self._doc.SaveToFile(path) self._doc.SaveToFile(path)
...@@ -886,8 +952,9 @@ class SpireWordDoc(DocBase): ...@@ -886,8 +952,9 @@ class SpireWordDoc(DocBase):
if __name__ == "__main__": if __name__ == "__main__":
doc = SpireWordDoc() doc = SpireWordDoc()
doc.load(r"/home/ccran/lufa-contract/demo/今麦郎合同审核.docx") doc.load(r"/home/ccran/lufa-contract/demo/今麦郎合同审核.docx")
print(doc._doc_name)
print("附件2《技术协议》" in doc.get_all_text()) # print(doc._doc_name)
# print("附件2《技术协议》" in doc.get_all_text())
# doc.add_chunk_comment( # doc.add_chunk_comment(
# 0, # 0,
# [ # [
...@@ -902,4 +969,6 @@ if __name__ == "__main__": ...@@ -902,4 +969,6 @@ if __name__ == "__main__":
# } # }
# ], # ],
# ) # )
# doc.to_file("/home/ccran/lufa-contract/demo/今麦郎合同审核_test.docx", True)
doc.add_comment_to_first_paragraph("这是第一段的批注", "审阅助手")
doc.to_file("/home/ccran/lufa-contract/demo/今麦郎合同审核_test.docx", True)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment