feat: add paddle ocr;

300ece59 · ccran · 0b37ae6f · 300ece59 · 300ece59 · 300ece59
Commit 300ece59 authored May 27, 2026 by ccran
Hide whitespace changes
Inline Side-by-side

Showing with 95 additions and 3 deletions

core/config.py
+1 -1

data/rules.xlsx
+0 -0

requirements.txt
+0 -2

utils/paddle_ocr_util.py
+94 -0

No files found.
--- a/core/config.py
+++ b/core/config.py
@@ -46,7 +46,7 @@ FILE_SUFFIX = "-审核批注"
 ## 关键参数**
 use_non_fastgpt_llm = False
-use_lufa = False
+use_lufa = True
 use_jp_machine = True
 ## 关键参数**

--- a/data/rules.xlsx
+++ b/data/rules.xlsx
--- a/requirements.txt
+++ b/requirements.txt
@@ -65,8 +65,6 @@ setuptools==80.9.0
 simplejson==3.20.2
 six==1.17.0
 sniffio==1.3.1
-spire-doc==14.1.0
-spire-pdf==12.1.3
 starlette==0.50.0
 tenacity==9.1.2
 thefuzz==0.22.1

--- a/utils/paddle_ocr_util.py
+++ b/utils/paddle_ocr_util.py
+import asyncio
+import codecs
+import json
+import re
+from urllib import parse
+from urllib.parse import urlparse
+import aiohttp
+from aiohttp import ClientSession
+from loguru import logger
+from utils.common_util import random_str
+from utils.http_util import download_file, url_replace_fastgpt
+class PaddleOCRUtil:
+    """Client for an HTTP PaddleOCR service that extracts text from PDF files.
+
+    A PDF is uploaded as multipart form data to ``ocr_url``; the recognised
+    text is read from ``data.text`` of the JSON envelope the service returns.
+    """
+
+    def __init__(self, ocr_url='http://192.168.252.71:56100/ocr/pdf-robust', timeout=1200):
+        """Store the service endpoint and the per-request timeout.
+
+        Args:
+            ocr_url: URL the PDF is POSTed to.
+            timeout: Total timeout, in seconds, for one OCR request.
+        """
+        self.ocr_url = ocr_url
+        self.timeout = timeout
+
+    @staticmethod
+    def _safe_filename(name):
+        """Return the last path component of *name*, or '' if none is usable.
+
+        Both '/' and '\\' are treated as separators, and '.' / '..' are
+        rejected, so the result can never point outside a target directory.
+        """
+        base = re.split(r'[\\/]', str(name or ''))[-1].strip()
+        return '' if base in ('', '.', '..') else base
+
+    @staticmethod
+    def _decode_text(text):
+        """Normalise OCR text, undoing a second layer of ``\\uXXXX`` escaping.
+
+        Args:
+            text: Value of the response's text field; ``None`` and non-string
+                values are accepted.
+
+        Returns:
+            The stripped text, with literal backslash escapes decoded when at
+            least one ``\\uXXXX`` sequence is present. Returns '' for ``None``
+            or blank input.
+        """
+        if text is None:
+            return ''
+        if not isinstance(text, str):
+            text = str(text)
+        text = text.strip()
+        if not text:
+            return ''
+        # json.loads normally decodes "\u4e2d" into Chinese. Some services
+        # return the text field double-escaped, so decode only when needed.
+        if not re.search(r'\\u[0-9a-fA-F]{4}', text):
+            return text
+        try:
+            # Decoding a str with 'unicode_escape' directly would mangle any
+            # real non-ASCII character already in the text. Turning those
+            # characters into escapes first makes the round trip lossless.
+            raw = text.encode('latin-1', 'backslashreplace')
+            decoded = codecs.decode(raw, 'unicode_escape')
+            # "\ud83d\ude00"-style escapes decode to two lone surrogates;
+            # a UTF-16 round trip joins each pair into one code point.
+            return decoded.encode('utf-16-le', 'surrogatepass').decode('utf-16-le', 'surrogatepass')
+        except UnicodeError:
+            logger.warning('paddle ocr text unicode_escape decode failed, use raw text.')
+            return text
+
+    def _parse_response_text(self, response_text):
+        """Extract the recognised text from the service's JSON response body.
+
+        Args:
+            response_text: Raw response body as a string.
+
+        Returns:
+            The decoded ``data.text`` value ('' when it is absent).
+
+        Raises:
+            ValueError: If the body is not a JSON object, the envelope does
+                not report success (``ok`` truthy and ``code == 0``), or
+                ``data`` is present but not an object.
+        """
+        try:
+            rsp_json = json.loads(response_text)
+        except json.JSONDecodeError as exc:
+            raise ValueError(f'Invalid paddle ocr response json: {response_text[:500]}') from exc
+        # Valid JSON that is not an object (list, string, number) would
+        # otherwise fail below with an AttributeError on .get().
+        if not isinstance(rsp_json, dict):
+            raise ValueError(f'Invalid paddle ocr response json: {response_text[:500]}')
+        if not rsp_json.get('ok') or rsp_json.get('code') != 0:
+            raise ValueError(f'Paddle ocr failed: {rsp_json}')
+        data = rsp_json.get('data') or {}
+        if not isinstance(data, dict):
+            raise ValueError(f'Paddle ocr failed: {rsp_json}')
+        return self._decode_text(data.get('text', ''))
+
+    async def ocr_requests_async(self, session, file_path):
+        """Upload one PDF to the OCR endpoint and return the raw response body.
+
+        Args:
+            session: An open ``aiohttp.ClientSession`` used for the POST.
+            file_path: Path of the local PDF to upload.
+
+        Returns:
+            The response body as text.
+
+        Raises:
+            aiohttp.ClientResponseError: If the service answers with an HTTP
+                error status.
+        """
+        logger.info(f'paddle ocr pdf request:{file_path}')
+        upload_name = self._safe_filename(file_path) or 'file.pdf'
+        with open(file_path, 'rb') as pdf_file:
+            form = aiohttp.FormData()
+            form.add_field(
+                'file',
+                pdf_file,
+                filename=upload_name,
+                content_type='application/pdf',
+            )
+            async with session.post(self.ocr_url, data=form) as response:
+                response_text = await response.text()
+                if response.status >= 400:
+                    # raise_for_status() drops the body, so log it first.
+                    logger.error(f'paddle ocr http {response.status}: {response_text[:500]}')
+                response.raise_for_status()
+                return response_text
+
+    async def ocr_result_pdf(self, dest_path):
+        """OCR the PDF at *dest_path* and return its text.
+
+        Args:
+            dest_path: Path of the local PDF file.
+
+        Returns:
+            A single-element list holding the recognised text of the file.
+        """
+        timeout = aiohttp.ClientTimeout(total=self.timeout)
+        async with ClientSession(timeout=timeout) as session:
+            response_text = await self.ocr_requests_async(session, dest_path)
+        text = self._parse_response_text(response_text)
+        logger.info(f'paddle ocr pdf finish. text chars:{len(text)}')
+        return [text]
+
+    def ocr_download_path(self, url):
+        """Fetch the file behind *url* into ``ocr/`` and return its local path.
+
+        The local name comes from the URL's ``filename`` query parameter,
+        reduced to its last path component; a random ``.pdf`` name is used
+        when the parameter is missing or unusable. The URL is first passed
+        through ``url_replace_fastgpt`` and the transfer is delegated to
+        ``download_file``.
+
+        Args:
+            url: Source URL of the file.
+
+        Returns:
+            The destination path, ``ocr/<filename>``.
+        """
+        logger.info(f'paddle ocr url:{url}')
+        url = url_replace_fastgpt(url)
+        query_dict = parse.parse_qs(urlparse(url).query)
+        requested = query_dict.get('filename', [''])[0]
+        # The query string is caller-controlled: never let it add directory
+        # components (e.g. "../../x") to the destination path.
+        filename = self._safe_filename(requested) or f'{random_str()}.pdf'
+        dest_path = f'ocr/{filename}'
+        download_file(url, dest_path)
+        return dest_path
+if __name__ == '__main__':
+    # Manual smoke test: OCR a local sample PDF through the default endpoint
+    # and print how many entries came back, followed by the entries themselves.
+    sample_pdf = 'demo/2020100593中建大成建筑（B类）.pdf'
+    client = PaddleOCRUtil()
+    result = asyncio.run(client.ocr_result_pdf(sample_pdf))
+    print(f'len(result):{len(result)}')
+    print(result)