Commit 300ece59 by ccran

feat: add paddle ocr;

parent 0b37ae6f
...@@ -46,7 +46,7 @@ FILE_SUFFIX = "-审核批注" ...@@ -46,7 +46,7 @@ FILE_SUFFIX = "-审核批注"
## 关键参数** ## 关键参数**
use_non_fastgpt_llm = False use_non_fastgpt_llm = False
use_lufa = False use_lufa = True
use_jp_machine = True use_jp_machine = True
## 关键参数** ## 关键参数**
......
No preview for this file type
...@@ -65,8 +65,6 @@ setuptools==80.9.0 ...@@ -65,8 +65,6 @@ setuptools==80.9.0
simplejson==3.20.2 simplejson==3.20.2
six==1.17.0 six==1.17.0
sniffio==1.3.1 sniffio==1.3.1
spire-doc==14.1.0
spire-pdf==12.1.3
starlette==0.50.0 starlette==0.50.0
tenacity==9.1.2 tenacity==9.1.2
thefuzz==0.22.1 thefuzz==0.22.1
......
import asyncio
import codecs
import json
import re
from urllib import parse
from urllib.parse import urlparse
import aiohttp
from aiohttp import ClientSession
from loguru import logger
from utils.common_util import random_str
from utils.http_util import download_file, url_replace_fastgpt
class PaddleOCRUtil:
    """Client for a Paddle OCR HTTP endpoint that extracts text from PDF files.

    The response handling in this class expects a JSON object with a truthy
    ``ok`` field, ``code == 0`` and the recognised text under ``data.text``.
    """

    def __init__(self, ocr_url='http://192.168.252.71:56100/ocr/pdf-robust'):
        """Remember the OCR endpoint URL used by ``ocr_requests_async``."""
        self.ocr_url = ocr_url

    @staticmethod
    def _decode_text(text):
        """Normalise the OCR text field into a stripped, readable string.

        ``None`` becomes ``''``; non-string values are converted with ``str``;
        surrounding whitespace is stripped. When the text still contains
        literal ``\\uXXXX`` sequences (a double-escaped payload), backslash
        escapes are decoded. If decoding fails, a warning is logged and the
        stripped raw text is returned unchanged.
        """
        if text is None:
            return ''
        if not isinstance(text, str):
            text = str(text)
        text = text.strip()
        if not text:
            return ''
        # json.loads normally decodes "\u4e2d" into Chinese. Some services
        # return the text field double-escaped, so decode only when needed.
        if re.search(r'\\u[0-9a-fA-F]{4}', text):
            try:
                # The unicode_escape codec reads its input as Latin-1 bytes.
                # Feeding it a str directly would first encode that str as
                # UTF-8 and turn every real non-ASCII character into mojibake.
                # Encoding with backslashreplace keeps Latin-1 characters as
                # single bytes and rewrites everything else as an escape, so
                # the decode step restores them together with the escapes
                # that were already present in the text.
                text = text.encode('latin-1', 'backslashreplace').decode('unicode_escape')
            except UnicodeDecodeError:
                logger.warning('paddle ocr text unicode_escape decode failed, use raw text.')
        return text

    @staticmethod
    def _safe_filename(name):
        """Return the last path component of *name*, or ``''`` if unusable.

        Both ``/`` and ``\\`` are treated as separators. ``''``, ``'.'`` and
        ``'..'`` are rejected so the result can never point at a directory or
        climb out of the directory it is later joined with.
        """
        base = name.replace('\\', '/').split('/')[-1].strip()
        if base in ('', '.', '..'):
            return ''
        return base

    def _parse_response_text(self, response_text):
        """Extract the recognised text from the raw OCR response body.

        Args:
            response_text: Response body as a string; must contain JSON.

        Returns:
            The decoded text from ``data.text`` (``''`` when absent).

        Raises:
            ValueError: If the body is not valid JSON, is not a JSON object,
                reports failure (``ok`` falsy or ``code != 0``), or carries a
                ``data`` value that is not an object.
        """
        try:
            rsp_json = json.loads(response_text)
        except json.JSONDecodeError as exc:
            raise ValueError(f'Invalid paddle ocr response json: {response_text[:500]}') from exc
        # A JSON array/string/number has no .get(); report it as an invalid
        # response instead of leaking an AttributeError to the caller.
        if not isinstance(rsp_json, dict):
            raise ValueError(f'Invalid paddle ocr response json: {response_text[:500]}')
        if not rsp_json.get('ok') or rsp_json.get('code') != 0:
            raise ValueError(f'Paddle ocr failed: {rsp_json}')
        data = rsp_json.get('data') or {}
        if not isinstance(data, dict):
            raise ValueError(f'Invalid paddle ocr response data: {rsp_json}')
        return self._decode_text(data.get('text', ''))

    async def ocr_requests_async(self, session, file_path):
        """Upload the file at *file_path* to the OCR endpoint.

        Args:
            session: Client session used to issue the POST request.
            file_path: Path of the PDF to upload as the ``file`` form field.

        Returns:
            The response body as text.

        Raises:
            Whatever ``response.raise_for_status()`` raises for an error
            status, and ``OSError`` if the file cannot be opened.
        """
        logger.info(f'paddle ocr pdf request:{file_path}')
        # The file handle must stay open until the request has been sent,
        # so the POST is issued inside the ``with open`` block.
        with open(file_path, 'rb') as pdf_file:
            form = aiohttp.FormData()
            form.add_field(
                'file',
                pdf_file,
                filename=file_path.split('/')[-1],
                content_type='application/pdf',
            )
            async with session.post(self.ocr_url, data=form) as response:
                # Read the body before checking the status so that the
                # service's error payload can be logged on failure.
                response_text = await response.text()
                if response.status >= 400:
                    logger.error(
                        f'paddle ocr http status:{response.status}, body:{response_text[:500]}'
                    )
                response.raise_for_status()
                return response_text

    async def ocr_result_pdf(self, dest_path):
        """Run OCR on the PDF at *dest_path*.

        Opens a dedicated client session with a total timeout of 1200
        seconds, uploads the file and parses the response.

        Returns:
            A single-element list containing the recognised text.
        """
        timeout = aiohttp.ClientTimeout(total=1200)
        async with ClientSession(timeout=timeout) as session:
            response_text = await self.ocr_requests_async(session, dest_path)
        text = self._parse_response_text(response_text)
        logger.info(f'paddle ocr pdf finish. text chars:{len(text)}')
        return [text]

    def ocr_download_path(self, url):
        """Download *url* into the ``ocr/`` directory and return the path.

        The URL is first passed through ``url_replace_fastgpt``. The target
        file name comes from the ``filename`` query parameter when it yields
        a usable name; otherwise a random ``.pdf`` name is generated.
        """
        logger.info(f'paddle ocr url:{url}')
        url = url_replace_fastgpt(url)
        query_dict = parse.parse_qs(urlparse(url).query)
        # parse_qs drops blank values by default, so a present key always
        # maps to a non-empty list.
        requested_name = query_dict.get('filename', [''])[0]
        # The query string is caller-controlled: keep only the final path
        # component so values such as "../../x" cannot escape "ocr/".
        filename = self._safe_filename(requested_name) or f'{random_str()}.pdf'
        dest_path = f'ocr/{filename}'
        download_file(url, dest_path)
        return dest_path
if __name__ == '__main__':
    # Manual smoke test: OCR a local demo PDF and print the number of
    # returned items followed by the items themselves.
    demo_pdf = 'demo/2020100593中建大成建筑(B类).pdf'
    result = asyncio.run(PaddleOCRUtil().ocr_result_pdf(demo_pdf))
    print(f'len(result):{len(result)}')
    print(result)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment