Commit 49101664 by ccran

feat:添加batch;

parent 68c08496
......@@ -2,3 +2,5 @@ tmp/
*.pyc
**__pycache__**
batch/input
batch/output
\ No newline at end of file
{
"python-envs.defaultEnvManager": "ms-python.python:conda",
"python-envs.defaultPackageManager": "ms-python.python:conda"
}
\ No newline at end of file
# 批处理文档
import os
import re
import sys
sys.path.append('..')
import traceback
import concurrent.futures
from loguru import logger
from utils.common_util import random_str
from utils.http_util import upload_file, fastgpt_openai_chat, download_file
def extract_url(text):
    """
    Pull the exported Excel and Doc result links out of a workflow response.

    The text is searched for the first "导出Excel结果" marker followed by a
    link ending in ``xlsx`` and the first "导出Doc结果" marker followed by a
    link ending in ``.doc``/``.docx``/``.pdf``/``.wps``.

    :param text: response text to scan.
    :return: ``(excel_url, doc_url)`` when BOTH links are found, otherwise
        ``(None, None)``.
    """
    excel_hit = re.search(r'导出Excel结果\s*([^"]*xlsx)', text)
    doc_hit = re.search(r'导出Doc结果\s*([^\" ]+?\.(?:docx?|pdf|wps))', text)
    # A partial result is treated the same as no result at all.
    if excel_hit is None or doc_hit is None:
        return None, None
    # Group 1 holds just the link, without the leading marker text.
    return excel_hit.group(1), doc_hit.group(1)
def process_single_file(file, batch_input_dir_path, batch_output_dir_path, counter, start_file):
    """
    Run one input document through the review workflow and download its results.

    Intended to be submitted to a thread pool by the batch driver. The file is
    uploaded, the chat-completions workflow is invoked with the uploaded URL,
    and the Excel/Doc links found in the response are downloaded into the
    output directory under the input file's base name.

    :param file: file name (not a path) inside ``batch_input_dir_path``.
    :param batch_input_dir_path: directory containing the input files.
    :param batch_output_dir_path: directory that receives the downloaded results.
    :param counter: 1-based position of this file in the batch.
    :param start_file: files whose ``counter`` is below this value are skipped.
    :return: ``None``. Any exception raised while processing is logged, not
        propagated, so one bad file does not abort the rest of the batch.
    """
    # Skip files positioned before the resume point.
    if start_file > counter:
        return
    # splitext copes with names that contain no dot; slicing on rfind('.')
    # would get -1 there and silently chop the last character off the name.
    file_name, ext_name = os.path.splitext(file)
    original_file = f'{batch_input_dir_path}/{file}'
    des_excel_file = f'{batch_output_dir_path}/{file_name}.xlsx'
    des_doc_file = f'{batch_output_dir_path}/{file_name}{ext_name}'
    # Returned links carry the outer address; rewrite them to the inner one.
    outer_host, inner_host = '218.77.58.8', '192.168.252.71'
    try:
        # Upload the source document.
        file_url = upload_file(original_file, input_url_to_inner=True).replace(outer_host, inner_host)
        model = 'Qwen2-72B-Instruct'
        url = 'http://192.168.252.71:18089/api/v1/chat/completions'
        logger.info(f' 第{counter}个文件,处理文件: {original_file}')
        # NOTE(review): hard-coded credential committed to version control;
        # consider loading it from configuration or the environment instead.
        token = 'fastgpt-ek3Z6PxI6sXgYc0jxzZ5bVGqrxwM6aVyfSmA6JVErJYBMr2KmYxrHwEUOIMSYz'
        result = fastgpt_openai_chat(url, token, model, random_str(), file_url, f'0304批处理任务-{file_name}', False)
        excel_url, doc_url = extract_url(result)
        if excel_url and doc_url:
            download_file(excel_url.replace(outer_host, inner_host), des_excel_file)
            download_file(doc_url.replace(outer_host, inner_host), des_doc_file)
            logger.info(f'第{counter}个文件下载:{excel_url}到{des_excel_file} {des_doc_file}')
        else:
            # Previously this case passed silently, leaving no trace of the miss.
            logger.warning(f'第{counter}个文件未提取到结果链接: {original_file}')
    except Exception as e:
        logger.error(f'{original_file} 处理异常 第{counter}个文件: {e}')
        # format_exc() returns the traceback text; print_exc() returns None,
        # so the old call logged the literal "None".
        logger.error(traceback.format_exc())
def execute_batch(max_workers: int = 4):
    """
    Process every entry of the ``input`` directory concurrently.

    Each entry is handed to ``process_single_file`` on a thread pool together
    with its 1-based position, and results are written under ``output``. Both
    directories are relative to the current working directory.

    :param max_workers: size of the thread pool.
    :raises FileNotFoundError: if the ``input`` directory does not exist.
    """
    batch_input_dir_path = 'input'
    batch_output_dir_path = 'output'
    # Entries whose position is below this value are skipped (manual resume point).
    start_file = 1
    # os.listdir returns entries in arbitrary order; sort so that a file's
    # counter, and therefore the start_file resume point, is stable across runs.
    files = sorted(os.listdir(batch_input_dir_path))
    # Make sure the download destination exists before any worker writes to it.
    os.makedirs(batch_output_dir_path, exist_ok=True)
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(
                process_single_file,
                file,
                batch_input_dir_path,
                batch_output_dir_path,
                counter,
                start_file,
            )
            for counter, file in enumerate(files, start=1)
        ]
        # Calling result() re-raises, in this thread, any exception a worker let escape.
        for f in concurrent.futures.as_completed(futures):
            f.result()
if __name__ == '__main__':
    # Script entry point: run the batch with five worker threads.
    execute_batch(max_workers=5)
\ No newline at end of file
......@@ -26,7 +26,7 @@ class LLMConfig:
model: str = 'Qwen2-72B-Instruct'
outer_backend_url = "http://218.77.58.8:48081"
outer_backend_url = "http://znkf.lgfzgroup.com:48081"
base_fastgpt_url = "http://192.168.252.71:18089"
base_backend_url = "http://192.168.252.71:48081"
......
......@@ -24,6 +24,15 @@ summary_tool = SegmentSummaryTool()
review_tool = SegmentReviewTool()
reflect_tool = ReflectRetryTool()
@app.post("/sleep")
def sleep(t: int):
    """
    Block the request handler for ``t`` seconds, then report completion.

    :param t: number of seconds to sleep.
    :return: a dict with a single ``res`` key holding the confirmation message.
    """
    # Imported locally under an alias: this function's own name is `sleep`.
    from time import sleep as _pause

    _pause(t)
    message = f'sleep over for {t} seconds.'
    return {'res': message}
########################################################################################################################
class DocumentParseRequest(BaseModel):
......
......@@ -218,7 +218,7 @@ def find_best_match(sub_chunks, comment):
best_match = None # 存储最佳匹配的结果
best_score = -1 # 存储最高相似度(初始化为-1)
print(f"开始处理评论: {comment['original_text'][:30]}...") # 显示简化的原始评论
# print(f"开始处理评论: {comment['original_text'][:30]}...") # 显示简化的原始评论
for obj in sub_chunks:
if isinstance(obj, Paragraph):
......@@ -266,7 +266,7 @@ def table_contract(target_texts, comment):
best_match = None # 存储最佳匹配的结果
best_score = -1 # 存储最高相似度(初始化为-1)
print(f"开始处理评论: {comment['original_text'][:30]}...") # 显示简化的原始评论
# print(f"开始处理评论: {comment['original_text'][:30]}...") # 显示简化的原始评论
original_text = comment["original_text"]
for target_text in target_texts:
......@@ -555,7 +555,7 @@ class SpireWordDoc(DocBase):
)
added = True
print(f"表格批注添加成功: '{target_text[:20]}...'")
# print(f"表格批注添加成功: '{target_text[:20]}...'")
# 添加成功后跳出内层循环
break
......@@ -595,7 +595,7 @@ class SpireWordDoc(DocBase):
self._doc.Comments.get_Item(
existing_comment_idx
).Body.Paragraphs.get_Item(0).Text = suggest
print(f"批注已存在,更新内容: '{find_key[:20]}...'")
# print(f"批注已存在,更新内容: '{find_key[:20]}...'")
continue
matched = False
......@@ -617,7 +617,7 @@ class SpireWordDoc(DocBase):
elif isinstance(obj, Table):
try:
if self.add_table_comment(obj, find_key, suggest, author):
print(f"表格批注添加成功: '{find_key[:20]}...'")
# print(f"表格批注添加成功: '{find_key[:20]}...'")
matched = True
break
except Exception as e:
......@@ -636,7 +636,7 @@ class SpireWordDoc(DocBase):
if text_sel and self.set_comment_by_text_selection(
text_sel, author, suggest
):
print(f"模糊批注添加成功: '{match_text[:20]}...'")
# print(f"模糊批注添加成功: '{match_text[:20]}...'")
matched = True
break
......@@ -647,9 +647,7 @@ class SpireWordDoc(DocBase):
if text_sel and self.set_comment_by_text_selection(
text_sel, author, suggest
):
print(
f"处理后批注添加成功: '{processed_text[:20]}...'"
)
# print(f"处理后批注添加成功: '{processed_text[:20]}...'")
matched = True
break
......@@ -664,9 +662,7 @@ class SpireWordDoc(DocBase):
if best_table_match and self.add_table_comment(
obj, best_table_match, suggest, author
):
print(
f"表格批注添加成功: '{best_table_match[:20]}...'"
)
# print(f"表格批注添加成功: '{best_table_match[:20]}...'")
matched = True
break
except Exception as e:
......@@ -710,7 +706,7 @@ class SpireWordDoc(DocBase):
# 删除批注
if existing_comment_idx is not None:
self._doc.Comments.RemoveAt(existing_comment_idx)
print(f"已删除合格批注: '{author}'")
# print(f"已删除合格批注: '{author}'")
else:
# 不合格,更新或新增
suggest = comment.get("suggest", "")
......@@ -718,7 +714,7 @@ class SpireWordDoc(DocBase):
self._doc.Comments.get_Item(
existing_comment_idx
).Body.Paragraphs.get_Item(0).Text = suggest
print(f"更新已有批注: '{author}'")
# print(f"更新已有批注: '{author}'")
else:
# chunk_id要从comment中获取
self.add_chunk_comment(comment["chunk_id"] - 1, [comment])
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment