feat:添加batch;

49101664 · ccran · 68c08496 · 49101664 · 49101664 · 49101664
Commit 49101664 authored Mar 05, 2026 by ccran
10 changed files
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,5 @@ tmp/

 *.pyc
 **__pycache__**
+batch/input
+batch/output
\ No newline at end of file
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
+{
+    "python-envs.defaultEnvManager": "ms-python.python:conda",
+    "python-envs.defaultPackageManager": "ms-python.python:conda"
+}
\ No newline at end of file
--- a/__pycache__/main.cpython-312.pyc
+++ b/__pycache__/main.cpython-312.pyc
--- a/batch/batch.py
+++ b/batch/batch.py
+# 批处理文档
+import os
+import re
+import sys
+
+sys.path.append('..')
+import traceback
+import concurrent.futures
+
+from loguru import logger
+
+from utils.common_util import random_str
+from utils.http_util import upload_file, fastgpt_openai_chat, download_file
+
+
+def extract_url(text):
+    """Pull the exported Excel and Doc result URLs out of a reply text.
+
+    Args:
+        text: String searched for the ``导出Excel结果`` and ``导出Doc结果``
+            markers, each followed by the URL to capture.
+
+    Returns:
+        ``(excel_url, doc_url)`` when both markers are followed by a
+        matching URL; ``(None, None)`` otherwise. Finding only one of the
+        two is treated the same as finding neither.
+    """
+    patterns = (
+        r'导出Excel结果\s*([^"]*xlsx)',
+        r'导出Doc结果\s*([^\" ]+?\.(?:docx?|pdf|wps))',
+    )
+    # re.search only reports the first occurrence of each marker.
+    matches = [re.search(pattern, text) for pattern in patterns]
+    if not all(matches):
+        return None, None
+    excel_match, doc_match = matches
+    return excel_match.group(1), doc_match.group(1)
+
+
+def process_single_file(file, batch_input_dir_path, batch_output_dir_path, counter, start_file):
+    """Send one input file through the chat workflow and download its results.
+
+    Written so it can be submitted to a thread pool. Any ``Exception`` raised
+    while uploading, chatting or downloading is logged and swallowed, so one
+    failing file does not stop the others.
+
+    Args:
+        file: File name (not a path) inside ``batch_input_dir_path``.
+        batch_input_dir_path: Directory holding the input files.
+        batch_output_dir_path: Directory the downloaded results are written to.
+        counter: 1-based position of this file in the batch; used in log lines.
+        start_file: Files whose ``counter`` is below this value are skipped.
+
+    Returns:
+        None.
+    """
+    # Skip files that come before the requested starting position.
+    if start_file > counter:
+        return
+
+    # os.path.splitext handles names without a '.'; slicing on rfind() == -1
+    # would silently cut off the last character of such a name.
+    file_name, ext_name = os.path.splitext(file)
+    original_file = f'{batch_input_dir_path}/{file}'
+    des_excel_file = f'{batch_output_dir_path}/{file_name}.xlsx'
+    des_doc_file = f'{batch_output_dir_path}/{file_name}{ext_name}'
+
+    try:
+        # Upload the source file; the host in the returned URL is rewritten
+        # (presumably external address -> internal address; confirm).
+        file_url = upload_file(original_file, input_url_to_inner=True).replace('218.77.58.8', '192.168.252.71')
+        model = 'Qwen2-72B-Instruct'
+        url = 'http://192.168.252.71:18089/api/v1/chat/completions'
+        logger.info(' 第{}个文件,处理文件: {}'.format(counter, original_file))
+        # NOTE(review): API token is hard-coded in source; it should be moved
+        # to configuration or an environment variable and rotated.
+        token = 'fastgpt-ek3Z6PxI6sXgYc0jxzZ5bVGqrxwM6aVyfSmA6JVErJYBMr2KmYxrHwEUOIMSYz'
+        result = fastgpt_openai_chat(url, token, model, random_str(), file_url, f'0304批处理任务-{file_name}', False)
+        excel_url, doc_url = extract_url(result)
+        if excel_url and doc_url:
+            download_file(excel_url.replace('218.77.58.8', '192.168.252.71'), des_excel_file)
+            download_file(doc_url.replace('218.77.58.8', '192.168.252.71'), des_doc_file)
+            logger.info(f'第{counter}个文件下载:{excel_url}到{des_excel_file} {des_doc_file}')
+        else:
+            # Make the "nothing to download" case visible instead of silent.
+            logger.warning(f'第{counter}个文件未提取到导出结果链接: {original_file}')
+    except Exception as e:
+        logger.error(f'{original_file} 处理异常 第{counter}个文件: {e}')
+        # format_exc() returns the traceback text; print_exc() returns None,
+        # which made the logger record the literal string "None".
+        logger.error(traceback.format_exc())
+
+
+def execute_batch(max_workers: int = 4):
+    """Process every entry of the ``input`` directory on a thread pool.
+
+    Each entry is handed to ``process_single_file`` together with its 1-based
+    position. Both directories are relative to the current working directory.
+
+    Args:
+        max_workers: Maximum number of worker threads.
+
+    Raises:
+        OSError: If the input directory cannot be listed or the output
+            directory cannot be created.
+    """
+    batch_input_dir_path = 'input'
+    batch_output_dir_path = 'output'
+    start_file = 1
+    # os.listdir() returns names in arbitrary order; sorting keeps each file's
+    # counter stable between runs so resuming via start_file is meaningful.
+    files = sorted(os.listdir(batch_input_dir_path))
+    # Make sure the download target exists before any worker writes to it.
+    os.makedirs(batch_output_dir_path, exist_ok=True)
+
+    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+        futures = [
+            executor.submit(
+                process_single_file,
+                file,
+                batch_input_dir_path,
+                batch_output_dir_path,
+                counter,
+                start_file,
+            )
+            for counter, file in enumerate(files, start=1)
+        ]
+        # result() re-raises in this thread anything that escaped a worker.
+        for f in concurrent.futures.as_completed(futures):
+            f.result()
+
+
+if __name__ == '__main__':
+    # Script entry point: run the batch with five worker threads.
+    execute_batch(max_workers=5)
\ No newline at end of file
--- a/core/__pycache__/config.cpython-312.pyc
+++ b/core/__pycache__/config.cpython-312.pyc
--- a/core/config.py
+++ b/core/config.py
@@ -26,7 +26,7 @@ class LLMConfig:
    model: str = 'Qwen2-72B-Instruct'


-outer_backend_url = "http://218.77.58.8:48081"
+outer_backend_url = "http://znkf.lgfzgroup.com:48081"
 base_fastgpt_url = "http://192.168.252.71:18089"
 base_backend_url = "http://192.168.252.71:48081"


--- a/main.py
+++ b/main.py
@@ -24,6 +24,15 @@ summary_tool = SegmentSummaryTool()
 review_tool = SegmentReviewTool()
 reflect_tool = ReflectRetryTool()

+
+@app.post("/sleep")
+def sleep(t: int):
+    """Block the handler for ``t`` seconds, then report the duration slept.
+
+    Args:
+        t: Number of seconds to sleep.
+
+    Returns:
+        A dict with a single ``res`` key holding a confirmation message.
+    """
+    import time
+
+    time.sleep(t)
+    message = f'sleep over for {t} seconds.'
+    return {'res': message}
+
 ########################################################################################################################

 class DocumentParseRequest(BaseModel):

--- a/utils/__pycache__/common_util.cpython-312.pyc
+++ b/utils/__pycache__/common_util.cpython-312.pyc
--- a/utils/__pycache__/http_util.cpython-312.pyc
+++ b/utils/__pycache__/http_util.cpython-312.pyc
--- a/utils/spire_word_util.py
+++ b/utils/spire_word_util.py
@@ -218,7 +218,7 @@ def find_best_match(sub_chunks, comment):
    best_match = None  # 存储最佳匹配的结果
    best_score = -1  # 存储最高相似度（初始化为-1）

-    print(f"开始处理评论: {comment['original_text'][:30]}...")  # 显示简化的原始评论
+    # print(f"开始处理评论: {comment['original_text'][:30]}...")  # 显示简化的原始评论

    for obj in sub_chunks:
        if isinstance(obj, Paragraph):
@@ -266,7 +266,7 @@ def table_contract(target_texts, comment):
    best_match = None  # 存储最佳匹配的结果
    best_score = -1  # 存储最高相似度（初始化为-1）

-    print(f"开始处理评论: {comment['original_text'][:30]}...")  # 显示简化的原始评论
+    # print(f"开始处理评论: {comment['original_text'][:30]}...")  # 显示简化的原始评论

    original_text = comment["original_text"]
    for target_text in target_texts:
@@ -555,7 +555,7 @@ class SpireWordDoc(DocBase):
                        )

                        added = True
-                        print(f"表格批注添加成功: '{target_text[:20]}...'")
+                        # print(f"表格批注添加成功: '{target_text[:20]}...'")

                        # 添加成功后跳出内层循环
                        break
@@ -595,7 +595,7 @@ class SpireWordDoc(DocBase):
                self._doc.Comments.get_Item(
                    existing_comment_idx
                ).Body.Paragraphs.get_Item(0).Text = suggest
-                print(f"批注已存在，更新内容: '{find_key[:20]}...'")
+                # print(f"批注已存在，更新内容: '{find_key[:20]}...'")
                continue

            matched = False
@@ -617,7 +617,7 @@ class SpireWordDoc(DocBase):
                elif isinstance(obj, Table):
                    try:
                        if self.add_table_comment(obj, find_key, suggest, author):
-                            print(f"表格批注添加成功: '{find_key[:20]}...'")
+                            # print(f"表格批注添加成功: '{find_key[:20]}...'")
                            matched = True
                            break
                    except Exception as e:
@@ -636,7 +636,7 @@ class SpireWordDoc(DocBase):
                            if text_sel and self.set_comment_by_text_selection(
                                text_sel, author, suggest
                            ):
-                                print(f"模糊批注添加成功: '{match_text[:20]}...'")
+                                # print(f"模糊批注添加成功: '{match_text[:20]}...'")
                                matched = True
                                break

@@ -647,9 +647,7 @@ class SpireWordDoc(DocBase):
                                if text_sel and self.set_comment_by_text_selection(
                                    text_sel, author, suggest
                                ):
-                                    print(
-                                        f"处理后批注添加成功: '{processed_text[:20]}...'"
-                                    )
+                                    # print(f"处理后批注添加成功: '{processed_text[:20]}...'")
                                    matched = True
                                    break

@@ -664,9 +662,7 @@ class SpireWordDoc(DocBase):
                                if best_table_match and self.add_table_comment(
                                    obj, best_table_match, suggest, author
                                ):
-                                    print(
-                                        f"表格批注添加成功: '{best_table_match[:20]}...'"
-                                    )
+                                    # print(f"表格批注添加成功: '{best_table_match[:20]}...'")
                                    matched = True
                                    break
                except Exception as e:
@@ -710,7 +706,7 @@ class SpireWordDoc(DocBase):
                # 删除批注
                if existing_comment_idx is not None:
                    self._doc.Comments.RemoveAt(existing_comment_idx)
-                    print(f"已删除合格批注: '{author}'")
+                    # print(f"已删除合格批注: '{author}'")
            else:
                # 不合格，更新或新增
                suggest = comment.get("suggest", "")
@@ -718,7 +714,7 @@ class SpireWordDoc(DocBase):
                    self._doc.Comments.get_Item(
                        existing_comment_idx
                    ).Body.Paragraphs.get_Item(0).Text = suggest
-                    print(f"更新已有批注: '{author}'")
+                    # print(f"更新已有批注: '{author}'")
                else:
                    # chunk_id要从comment中获取
                    self.add_chunk_comment(comment["chunk_id"] - 1, [comment])