2026-03-16 11:57:07 +08:00
|
|
|
|
"""
|
|
|
|
|
|
PaddleOCR REST API Server
|
|
|
|
|
|
用于 Java 后端调用的 OCR 服务
|
|
|
|
|
|
|
|
|
|
|
|
功能:
|
|
|
|
|
|
- 封装 PaddleOCRVL 和 PP-OCRv5
|
|
|
|
|
|
- 提供 PDF 处理接口
|
|
|
|
|
|
- 提供图像识别接口
|
|
|
|
|
|
- 健康检查接口
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
|
|
import os
|
|
|
|
|
|
import sys
|
|
|
|
|
|
import json
|
|
|
|
|
|
import logging
|
|
|
|
|
|
from pathlib import Path
|
|
|
|
|
|
from flask import Flask, request, jsonify
|
|
|
|
|
|
from paddleocr import PaddleOCR, PaddleOCRVL
|
|
|
|
|
|
|
|
|
|
|
|
# Configure logging
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

app = Flask(__name__)

# Global model instances; both start out unset and are assigned by init_models()
vl_pipeline = ocr_pipeline = None

# Add the project root (the parent of this file's directory) to the Python path
PROJECT_ROOT = Path(__file__).parent.parent
sys.path.insert(0, os.fspath(PROJECT_ROOT))

logger.info(f"项目根目录: {PROJECT_ROOT}")

# Force local model usage to avoid user cache / auto-download:
# models are looked up in the "models" directory that sits next to this file.
MODEL_ROOT = Path(__file__).with_name("models")
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-03-16 16:34:15 +08:00
|
|
|
|
def _find_model_dir(base: Path, *parts: str, contains: str, requires_inference: bool = False, recursive: bool = False):
|
2026-03-16 11:57:07 +08:00
|
|
|
|
"""Find the first model directory that matches a name fragment."""
|
|
|
|
|
|
try:
|
|
|
|
|
|
root = base.joinpath(*parts)
|
|
|
|
|
|
if not root.exists():
|
|
|
|
|
|
return None
|
2026-03-16 16:34:15 +08:00
|
|
|
|
contains_lower = contains.lower()
|
|
|
|
|
|
it = root.rglob("*") if recursive else root.iterdir()
|
|
|
|
|
|
for p in it:
|
|
|
|
|
|
if p.is_dir() and contains_lower in p.name.lower():
|
|
|
|
|
|
if requires_inference and not (p / "inference.yml").exists():
|
|
|
|
|
|
continue
|
2026-03-16 11:57:07 +08:00
|
|
|
|
return p
|
|
|
|
|
|
except Exception:
|
|
|
|
|
|
return None
|
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def init_models():
    """Initialise the PaddleOCRVL and PP-OCRv5 pipelines.

    Looks for local model directories under ``MODEL_ROOT`` and, where one is
    found, passes it to the matching pipeline constructor. The results are
    stored in the module-level ``vl_pipeline`` and ``ocr_pipeline``; when a
    constructor raises, the error is logged and that global is set to ``None``.
    """
    global vl_pipeline, ocr_pipeline

    separator = "=" * 60

    def _local(fragment):
        # Recursive lookup that only accepts directories holding inference.yml
        return _find_model_dir(
            MODEL_ROOT, contains=fragment, requires_inference=True, recursive=True
        )

    # Resolve local model paths (if present)
    det_dir = _local("pp-ocrv5_server_det")
    rec_dir = _local("pp-ocrv5_server_rec")
    cls_dir = _find_model_dir(MODEL_ROOT, "cls", contains="cls_infer")
    layout_dir = _local("pp-doclayoutv3")
    vl_rec_dir = _local("paddleocr-vl-1.5")

    # Report every directory that was found, in a fixed order
    found = (
        (det_dir, f"Using local det model: {det_dir}"),
        (rec_dir, f"Using local rec model: {rec_dir}"),
        (cls_dir, f"Using local cls model: {cls_dir}"),
        (layout_dir, f"Using local layout model: {layout_dir}"),
        (vl_rec_dir, f"Using local VL model: {vl_rec_dir}"),
    )
    for model_dir, message in found:
        if model_dir:
            logger.info(message)

    # Initialise PaddleOCRVL
    logger.info(separator)
    logger.info("正在初始化 PaddleOCRVL...")
    logger.info(separator)
    try:
        vl_kwargs = {
            "use_seal_recognition": True,
            "use_ocr_for_image_block": True,
            "use_layout_detection": True,
            "use_doc_orientation_classify": False,
            "use_doc_unwarping": False,
        }
        if layout_dir:
            vl_kwargs.update({
                "layout_detection_model_dir": str(layout_dir),
                "layout_detection_model_name": "PP-DocLayoutV3",
            })
        if vl_rec_dir:
            vl_kwargs.update({
                "vl_rec_model_dir": str(vl_rec_dir),
                "vl_rec_model_name": "PaddleOCR-VL-1.5-0.9B",
            })

        vl_pipeline = PaddleOCRVL(**vl_kwargs)
        logger.info("✅ PaddleOCRVL 初始化成功")
    except Exception as e:
        logger.error(f"❌ PaddleOCRVL 初始化失败: {e}", exc_info=True)
        vl_pipeline = None

    # Initialise PP-OCRv5
    logger.info(separator)
    logger.info("正在初始化 PP-OCRv5...")
    logger.info(separator)
    try:
        ocr_kwargs = {
            "use_textline_orientation": False,
            "lang": "ch",
            "use_doc_orientation_classify": False,
            "use_doc_unwarping": False,
        }
        # Only override the model directories that were actually found locally
        local_overrides = (
            ("text_detection_model_dir", det_dir),
            ("text_recognition_model_dir", rec_dir),
        )
        for option, model_dir in local_overrides:
            if model_dir:
                ocr_kwargs[option] = str(model_dir)
        ocr_pipeline = PaddleOCR(**ocr_kwargs)
        logger.info("PP-OCRv5 初始化成功")
    except Exception as e:
        logger.error(f"PP-OCRv5 初始化失败: {e}", exc_info=True)
        ocr_pipeline = None

    # Summary of which pipelines are available
    logger.info(separator)
    logger.info("模型初始化完成")
    logger.info(f"PaddleOCRVL: {'✅ 可用' if vl_pipeline else '❌ 不可用'}")
    logger.info(f"PP-OCRv5: {'✅ 可用' if ocr_pipeline else '❌ 不可用'}")
    logger.info(separator)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@app.route('/health', methods=['GET'])
def health():
    """Health-check endpoint.

    Responds with a JSON object holding a fixed ``status`` of ``'ok'``, one
    flag per pipeline global telling whether it is set, and the project root
    path as a string.
    """
    payload = {
        'status': 'ok',
        'vl_model': vl_pipeline is not None,
        'ocr_model': ocr_pipeline is not None,
        'project_root': str(PROJECT_ROOT),
    }
    return jsonify(payload)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@app.route('/api/ocr/pdf', methods=['POST'])
def ocr_pdf():
    """Handle an OCR request for a PDF file.

    Request JSON:
        {
            "pdf_path": "/path/to/file.pdf",
            "output_dir": "/path/to/output",
            "verbose": false    // optional, enables detailed output
        }

    Response JSON (example from the original author; the body actually sent
    is whatever ``process_single_pdf_standalone`` returns):
        {
            "success": true,
            "cma": {
                "code": "2023000001",
                "confidence": 0.95,
                "method": "template_matching"
            },
            "institutions": ["威凯检测技术有限公司"],
            "error": null,
            // only included when verbose=true:
            "steps": { ... },
            "performance": { ... }
        }

    Error responses are ``{"success": false, "error": "..."}`` with status
    400 (body is not a non-empty JSON object, or ``pdf_path`` is missing),
    404 (``pdf_path`` does not exist) or 500 (the processing module cannot be
    imported, or processing raises).
    """
    try:
        # silent=True makes a malformed or non-JSON body come back as None
        # instead of raising, so the client gets the 400 below rather than the
        # generic 500 from the outer handler.
        data = request.get_json(silent=True)

        # Also reject JSON that is not an object (e.g. a list), which has no .get
        if not data or not isinstance(data, dict):
            return jsonify({'success': False, 'error': 'Invalid JSON'}), 400

        # NOTE(review): pdf_path and output_dir come straight from the request
        # body and are used as filesystem paths; restrict them if this service
        # can be reached by untrusted clients.
        pdf_path = data.get('pdf_path')
        output_dir = data.get('output_dir', 'output')
        verbose = data.get('verbose', False)  # verbose switch

        if not pdf_path:
            return jsonify({'success': False, 'error': 'pdf_path is required'}), 400

        if not os.path.exists(pdf_path):
            return jsonify({
                'success': False,
                'error': f'PDF file not found: {pdf_path}'
            }), 404

        logger.info("=" * 60)
        logger.info(f"处理 PDF: {pdf_path}")
        logger.info(f"输出目录: {output_dir}")
        logger.info(f"Verbose模式: {'启用' if verbose else '禁用'}")
        logger.info("=" * 60)

        # Create the output directory
        os.makedirs(output_dir, exist_ok=True)

        # Import the processing logic (from test_accuracy_batch_full.py)
        try:
            from test_accuracy_batch_full import process_single_pdf_standalone
        except ImportError as e:
            logger.error(f"无法导入 test_accuracy_batch_full: {e}")
            return jsonify({
                'success': False,
                'error': f'Cannot import test_accuracy_batch_full: {e}'
            }), 500

        # Process the PDF (the verbose flag is passed through)
        try:
            result = process_single_pdf_standalone(
                Path(pdf_path),
                Path(output_dir),
                ocr_model='paddleocr_vl',
                vl_pipeline=vl_pipeline,
                verbose=verbose
            )

            logger.info("✅ 处理成功")
            if result.get('cma'):
                logger.info(f" CMA: {result['cma'].get('code', 'N/A')}")
            logger.info(f" 机构数: {len(result.get('institutions', []))}")
            if result.get('institutions'):
                logger.info(f" 机构: {result['institutions'][0]}")
            logger.info("=" * 60)

            # Return the full result (including verbose details when enabled)
            return jsonify(result)

        except Exception as e:
            logger.error(f"PDF 处理失败: {e}", exc_info=True)
            return jsonify({
                'success': False,
                'error': f'PDF processing failed: {str(e)}'
            }), 500

    except Exception as e:
        logger.error(f"❌ 请求处理失败: {e}", exc_info=True)
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@app.route('/api/ocr/image', methods=['POST'])
def ocr_image():
    """Handle an OCR request for a single image (used for seal recognition).

    Request JSON:
        {
            "image_path": "/path/to/image.png"
        }

    Response JSON:
        {
            "success": true,
            "text": "recognised text"
        }

    Error responses are ``{"success": false, "error": "..."}`` with status
    400 (body is not a JSON object, or ``image_path`` is missing), 404
    (``image_path`` does not exist) or 500 (the VL pipeline is not
    initialised, or recognition raises).
    """
    try:
        # silent=True makes a malformed or non-JSON body come back as None
        # instead of raising; together with the type check this turns bad
        # input into a 400 instead of an AttributeError-driven 500.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            return jsonify({'success': False, 'error': 'Invalid JSON'}), 400

        image_path = data.get('image_path')

        if not image_path:
            return jsonify({'success': False, 'error': 'image_path is required'}), 400

        if not os.path.exists(image_path):
            return jsonify({'success': False, 'error': f'Image not found: {image_path}'}), 404

        logger.info(f"处理图像: {image_path}")

        # Recognise with PaddleOCRVL
        if not vl_pipeline:
            return jsonify({'success': False, 'error': 'PaddleOCRVL not initialized'}), 500

        # NOTE(review): confirm that the installed PaddleOCRVL exposes .ocr()
        # (its documentation describes predict()) and that each row of
        # result[0] is shaped so that line[0][0] is the recognised text.
        result = vl_pipeline.ocr(image_path)

        # Collect the text of every non-empty row of the first result
        texts = []
        if result:
            texts = [line[0][0] for line in result[0] if line]

        text = ' '.join(texts)
        logger.info(f"识别文本: {text}")

        return jsonify({
            'success': True,
            'text': text
        })

    except Exception as e:
        logger.error(f"❌ 图像识别失败: {e}", exc_info=True)
        return jsonify({'success': False, 'error': str(e)}), 500
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@app.errorhandler(404)
def not_found(error):
    """Answer 404 errors with a JSON body instead of the default page."""
    body = {'success': False, 'error': 'Endpoint not found'}
    return jsonify(body), 404
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@app.errorhandler(500)
def internal_error(error):
    """Answer 500 errors with a JSON body instead of the default page."""
    body = {'success': False, 'error': 'Internal server error'}
    return jsonify(body), 500
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Load the models before the server starts accepting requests
    init_models()

    # Listen address comes from the HOST / PORT environment variables,
    # falling back to 0.0.0.0:8081
    host = os.getenv('HOST', '0.0.0.0')
    port = int(os.getenv('PORT', 8081))

    logger.info("=" * 60)
    logger.info("Flask OCR API 服务器启动")
    logger.info(f"地址: http://{host}:{port}")
    logger.info("=" * 60)

    # Start the service
    app.run(host=host, port=port, threaded=True)
|