Skip to content
Toggle navigation
P
Projects
G
Groups
S
Snippets
Help
ccran
/
lufa-contract
This project
Loading...
Sign in
Toggle navigation
Go to a project
Project
Repository
Issues
0
Merge Requests
0
Pipelines
Wiki
Snippets
Members
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Commit
300ece59
authored
May 27, 2026
by
ccran
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
feat: add paddle ocr;
parent
0b37ae6f
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
95 additions
and
3 deletions
+95
-3
core/config.py
+1
-1
data/rules.xlsx
+0
-0
requirements.txt
+0
-2
utils/paddle_ocr_util.py
+94
-0
No files found.
core/config.py
View file @
300ece59
...
...
@@ -46,7 +46,7 @@ FILE_SUFFIX = "-审核批注"
## 关键参数**
 use_non_fastgpt_llm = False
-use_lufa = False
+use_lufa = True
 use_jp_machine = True
## 关键参数**
...
...
data/rules.xlsx
View file @
300ece59
No preview for this file type
requirements.txt
View file @
300ece59
...
...
@@ -65,8 +65,6 @@ setuptools==80.9.0
simplejson
==3.20.2
six
==1.17.0
sniffio
==1.3.1
spire-doc
==14.1.0
spire-pdf
==12.1.3
starlette
==0.50.0
tenacity
==9.1.2
thefuzz
==0.22.1
...
...
utils/paddle_ocr_util.py
0 → 100644
View file @
300ece59
import
asyncio
import
codecs
import
json
import
re
from
urllib
import
parse
from
urllib.parse
import
urlparse
import
aiohttp
from
aiohttp
import
ClientSession
from
loguru
import
logger
from
utils.common_util
import
random_str
from
utils.http_util
import
download_file
,
url_replace_fastgpt
class PaddleOCRUtil:
    """Client for a PaddleOCR HTTP endpoint that extracts text from PDF files.

    The endpoint is expected to answer with a JSON object of the form
    ``{"ok": true, "code": 0, "data": {"text": "..."}}``; anything else is
    reported as a ``ValueError``.
    """

    # Default overall timeout (in seconds) for a single OCR request.
    DEFAULT_TIMEOUT_SECONDS = 1200

    def __init__(self, ocr_url='http://192.168.252.71:56100/ocr/pdf-robust',
                 timeout_seconds=DEFAULT_TIMEOUT_SECONDS):
        """Store the OCR endpoint URL and the total request timeout.

        Args:
            ocr_url: Full URL of the OCR endpoint that accepts a PDF upload.
            timeout_seconds: Total timeout, in seconds, applied to the HTTP
                session created by ``ocr_result_pdf``.
        """
        self.ocr_url = ocr_url
        self.timeout_seconds = timeout_seconds

    @staticmethod
    def _decode_text(text):
        """Normalize the OCR text field and undo double-escaped unicode.

        Args:
            text: Value of the ``text`` field; ``None`` and non-string values
                are tolerated.

        Returns:
            The stripped text. If it contains literal ``\\uXXXX`` sequences
            they are decoded; on decode failure the stripped raw text is
            returned unchanged.
        """
        if text is None:
            return ''
        if not isinstance(text, str):
            text = str(text)
        text = text.strip()
        if not text:
            return ''

        # json.loads normally decodes "\u4e2d" into Chinese. Some services
        # return the text field double-escaped, so decode only when needed.
        if re.search(r'\\u[0-9a-fA-F]{4}', text):
            try:
                # Decoding a str directly with 'unicode_escape' mangles any
                # non-ASCII characters that are already decoded. Going through
                # latin-1 with backslashreplace turns those characters into
                # escapes first, so they survive the round trip.
                decoded = text.encode('latin-1', 'backslashreplace').decode('unicode_escape')
                # Escaped surrogate pairs come out as two lone surrogates;
                # a UTF-16 round trip merges them into one code point.
                text = decoded.encode('utf-16-le', 'surrogatepass').decode(
                    'utf-16-le', 'surrogatepass'
                )
            except UnicodeError:
                logger.warning('paddle ocr text unicode_escape decode failed, use raw text.')
        return text

    @staticmethod
    def _safe_filename(filename):
        """Return the last path component of *filename*, or '' if unusable.

        Both ``/`` and ``\\`` are treated as separators so that a crafted
        name cannot escape the download directory.
        """
        name = filename.replace('\\', '/').rsplit('/', 1)[-1]
        if name.strip() in ('', '.', '..'):
            return ''
        return name

    def _parse_response_text(self, response_text):
        """Extract the recognized text from the raw OCR response body.

        Args:
            response_text: Response body as returned by the OCR endpoint.

        Returns:
            The decoded ``data.text`` value ('' when absent).

        Raises:
            ValueError: If the body is not valid JSON, is not a JSON object,
                reports failure (``ok`` falsy or ``code`` != 0), or carries a
                non-object ``data`` field.
        """
        try:
            rsp_json = json.loads(response_text)
        except json.JSONDecodeError as exc:
            raise ValueError(f'Invalid paddle ocr response json: {response_text[:500]}') from exc

        # A JSON array or scalar has no .get(); report it as a failed response
        # instead of leaking an AttributeError to the caller.
        if not isinstance(rsp_json, dict):
            raise ValueError(f'Paddle ocr failed: {rsp_json}')
        if not rsp_json.get('ok') or rsp_json.get('code') != 0:
            raise ValueError(f'Paddle ocr failed: {rsp_json}')

        data = rsp_json.get('data') or {}
        if not isinstance(data, dict):
            raise ValueError(f'Paddle ocr failed: {rsp_json}')
        return self._decode_text(data.get('text', ''))

    async def ocr_requests_async(self, session, file_path):
        """Upload one PDF to the OCR endpoint and return the raw response body.

        Args:
            session: An aiohttp client session used to send the request.
            file_path: Path of the PDF file to upload.

        Returns:
            The response body as text.

        Raises:
            aiohttp.ClientResponseError: If the endpoint answers with an HTTP
                error status.
        """
        import os

        logger.info(f'paddle ocr pdf request:{file_path}')
        # The file must stay open until the request has been sent, so the
        # POST happens inside the ``with`` block.
        with open(file_path, 'rb') as pdf_file:
            form = aiohttp.FormData()
            form.add_field(
                'file',
                pdf_file,
                filename=os.path.basename(file_path),
                content_type='application/pdf',
            )
            async with session.post(self.ocr_url, data=form) as response:
                response_text = await response.text()
                response.raise_for_status()
                return response_text

    async def ocr_result_pdf(self, dest_path):
        """Run OCR on a local PDF and return its text.

        Args:
            dest_path: Path of the local PDF file.

        Returns:
            A single-element list containing the recognized text.
        """
        timeout = aiohttp.ClientTimeout(total=self.timeout_seconds)
        async with ClientSession(timeout=timeout) as session:
            response_text = await self.ocr_requests_async(session, dest_path)
        text = self._parse_response_text(response_text)
        logger.info(f'paddle ocr pdf finish. text chars:{len(text)}')
        return [text]

    def ocr_download_path(self, url):
        """Download the file behind *url* into ``ocr/`` and return its path.

        The target name is taken from the ``filename`` query parameter when
        present and usable; otherwise a random ``.pdf`` name is generated.

        Args:
            url: Source URL of the file.

        Returns:
            The relative destination path ``ocr/<filename>``.
        """
        logger.info(f'paddle ocr url:{url}')
        url = url_replace_fastgpt(url)
        query_dict = parse.parse_qs(urlparse(url).query)

        # The query string is caller-controlled: keep only the final path
        # component so names like "../../x" cannot leave the ocr/ directory.
        filename = self._safe_filename(query_dict.get('filename', [''])[0])
        if not filename:
            filename = f'{random_str()}.pdf'

        dest_path = f'ocr/{filename}'
        download_file(url, dest_path)
        return dest_path
if __name__ == '__main__':
    # Manual smoke test: OCR a bundled demo PDF and print the result.
    demo_pdf = 'demo/2020100593中建大成建筑(B类).pdf'
    client = PaddleOCRUtil()
    pages = asyncio.run(client.ocr_result_pdf(demo_pdf))
    print(f'len(result):{len(pages)}')
    print(pages)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment