361 lines
11 KiB
Python
361 lines
11 KiB
Python
"""
|
||
PaddleOCR REST API Server

OCR service intended to be called from a Java backend.

Features:
- Wraps PaddleOCRVL and PP-OCRv5
- Provides a PDF processing endpoint
- Provides an image recognition endpoint
- Provides a health-check endpoint
|
||
"""
|
||
|
||
import os
|
||
import sys
|
||
import json
|
||
import logging
|
||
import traceback
|
||
from pathlib import Path
|
||
from flask import Flask, request, jsonify
|
||
from paddleocr import PaddleOCR, PaddleOCRVL
|
||
|
||
# Logging configuration shared by the whole module.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

app = Flask(__name__)

# Module-level pipeline instances; both stay None until init_models() sets them.
vl_pipeline = ocr_pipeline = None

# Put the directory two levels above this file at the front of sys.path.
PROJECT_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(PROJECT_ROOT))

logger.info(f"项目根目录: {PROJECT_ROOT}")

# Force local model usage to avoid user cache / auto-download.
MODEL_ROOT = Path(__file__).parent.joinpath("models")
|
||
|
||
|
||
def _find_model_dir(base: Path, *parts: str, contains: str, requires_inference: bool = False, recursive: bool = False) -> "Path | None":
    """Find a model directory whose name contains a given fragment.

    The search root is ``base`` joined with ``parts``. Only directories are
    considered, and the name comparison is case-insensitive.

    Args:
        base: Directory the search starts from.
        *parts: Optional sub-path components appended to ``base``.
        contains: Fragment that must appear in the directory name.
        requires_inference: When true, a candidate is accepted only if it
            holds a file named ``inference.yml``.
        recursive: When true, search the whole tree below the root instead
            of only its direct children.

    Returns:
        The matching directory, or ``None`` when the root does not exist,
        nothing matches, or the lookup fails. When several directories
        match, the smallest path in sort order is returned, so the result
        does not depend on the order in which the filesystem lists entries.
    """
    try:
        root = base.joinpath(*parts)
        if not root.exists():
            return None
        needle = contains.lower()
        entries = root.rglob("*") if recursive else root.iterdir()
        matches = [
            entry
            for entry in entries
            if entry.is_dir()
            and needle in entry.name.lower()
            and (not requires_inference or (entry / "inference.yml").exists())
        ]
    except Exception:
        # Best-effort lookup: callers treat None as "no local model", so do
        # not raise, but leave a trace instead of failing silently.
        logging.getLogger(__name__).warning(
            "Model directory lookup failed (base=%s, contains=%r)",
            base, contains, exc_info=True,
        )
        return None
    # iterdir()/rglob() yield entries in arbitrary order; min() makes the
    # choice stable across runs and machines.
    return min(matches, default=None)
|
||
|
||
|
||
def init_models():
    """Build the PaddleOCRVL and PP-OCRv5 pipelines and store them in the module globals.

    Local model directories are looked up under ``MODEL_ROOT`` first; each one
    that is found is logged and, where applicable, handed to the matching
    pipeline constructor. A pipeline whose construction raises is logged with
    its traceback and left as ``None``; this function does not re-raise.
    """
    global vl_pipeline, ocr_pipeline

    banner = "=" * 60

    def locate(fragment):
        # Recursive lookup that only accepts directories holding inference.yml.
        return _find_model_dir(
            MODEL_ROOT, contains=fragment, requires_inference=True, recursive=True
        )

    # Resolve local model paths (each is None when not present).
    det_dir = locate("pp-ocrv5_server_det")
    rec_dir = locate("pp-ocrv5_server_rec")
    # The cls directory is only reported in the log; it is not passed to
    # either pipeline below.
    cls_dir = _find_model_dir(MODEL_ROOT, "cls", contains="cls_infer")
    layout_dir = locate("pp-doclayoutv3")
    vl_rec_dir = locate("paddleocr-vl-1.5")

    if det_dir:
        logger.info(f"Using local det model: {det_dir}")
    if rec_dir:
        logger.info(f"Using local rec model: {rec_dir}")
    if cls_dir:
        logger.info(f"Using local cls model: {cls_dir}")
    if layout_dir:
        logger.info(f"Using local layout model: {layout_dir}")
    if vl_rec_dir:
        logger.info(f"Using local VL model: {vl_rec_dir}")

    # --- PaddleOCRVL ---
    logger.info(banner)
    logger.info("正在初始化 PaddleOCRVL...")
    logger.info(banner)
    try:
        vl_kwargs = {
            "use_seal_recognition": True,
            "use_ocr_for_image_block": True,
            "use_layout_detection": True,
            "use_doc_orientation_classify": False,
            "use_doc_unwarping": False,
        }
        if layout_dir:
            vl_kwargs.update({
                "layout_detection_model_dir": str(layout_dir),
                "layout_detection_model_name": "PP-DocLayoutV3",
            })
        if vl_rec_dir:
            vl_kwargs.update({
                "vl_rec_model_dir": str(vl_rec_dir),
                "vl_rec_model_name": "PaddleOCR-VL-1.5-0.9B",
            })

        vl_pipeline = PaddleOCRVL(**vl_kwargs)
        logger.info("✅ PaddleOCRVL 初始化成功")
    except Exception as e:
        logger.error(f"❌ PaddleOCRVL 初始化失败: {e}", exc_info=True)
        vl_pipeline = None

    # --- PP-OCRv5 ---
    logger.info(banner)
    logger.info("正在初始化 PP-OCRv5...")
    logger.info(banner)
    try:
        ocr_kwargs = {
            "use_textline_orientation": False,
            "lang": "ch",
            "use_doc_orientation_classify": False,
            "use_doc_unwarping": False,
        }
        if det_dir:
            ocr_kwargs["text_detection_model_dir"] = str(det_dir)
        if rec_dir:
            ocr_kwargs["text_recognition_model_dir"] = str(rec_dir)
        ocr_pipeline = PaddleOCR(**ocr_kwargs)
        logger.info("PP-OCRv5 初始化成功")
    except Exception as e:
        logger.error(f"PP-OCRv5 初始化失败: {e}", exc_info=True)
        ocr_pipeline = None

    # Summary of which pipelines ended up available.
    logger.info(banner)
    logger.info("模型初始化完成")
    logger.info(f"PaddleOCRVL: {'✅ 可用' if vl_pipeline else '❌ 不可用'}")
    logger.info(f"PP-OCRv5: {'✅ 可用' if ocr_pipeline else '❌ 不可用'}")
    logger.info(banner)
|
||
|
||
|
||
@app.route('/health', methods=['GET'])
def health():
    """Health-check endpoint.

    Reports whether each pipeline global is currently set, together with the
    project root path. The ``status`` field is always ``'ok'``.
    """
    payload = {
        'status': 'ok',
        'vl_model': vl_pipeline is not None,
        'ocr_model': ocr_pipeline is not None,
        'project_root': str(PROJECT_ROOT),
    }
    return jsonify(payload)
|
||
|
||
|
||
def _normalize_pdf_result(result):
    """Add the flat fields read by the Java client, never overwriting existing keys.

    ``cma_code`` / ``confidence`` are copied from the nested ``cma`` object and
    ``institution_name`` from the first entry of ``institutions``. The dict is
    modified in place.
    """
    if "cma_code" not in result:
        cma_obj = result.get("cma") or {}
        if isinstance(cma_obj, dict):
            result["cma_code"] = cma_obj.get("code")
            if "confidence" not in result and cma_obj.get("confidence") is not None:
                result["confidence"] = cma_obj.get("confidence")
    if "institution_name" not in result:
        insts = result.get("institutions") or []
        if isinstance(insts, list) and len(insts) > 0:
            result["institution_name"] = insts[0]


@app.route('/api/ocr/pdf', methods=['POST'])
def ocr_pdf():
    """
    Handle an OCR request for a PDF file.

    Request JSON:
    {
        "pdf_path": "/path/to/file.pdf",
        "output_dir": "/path/to/output",
        "verbose": false   // optional, enables detailed output
    }

    Response JSON:
    {
        "success": true,
        "cma": {
            "code": "2023000001",
            "confidence": 0.95,
            "method": "template_matching"
        },
        "institutions": ["威凯检测技术有限公司"],
        "error": null,
        // only present when verbose=true:
        "steps": { ... },
        "performance": { ... }
    }

    Status codes: 400 for a missing/invalid JSON body or missing ``pdf_path``,
    404 when ``pdf_path`` does not exist, 500 when the processing module
    cannot be imported or processing raises.
    """
    try:
        # silent=True makes a malformed body or a non-JSON content type come
        # back as None, so it is rejected as a 400 below instead of raising
        # into the generic handler and being reported as a 500.
        data = request.get_json(silent=True)

        # The body must be a non-empty JSON object; a list or scalar has no .get().
        if not data or not isinstance(data, dict):
            return jsonify({'success': False, 'error': 'Invalid JSON'}), 400

        pdf_path = data.get('pdf_path')
        output_dir = data.get('output_dir', 'output')
        verbose = data.get('verbose', False)

        if not pdf_path:
            return jsonify({'success': False, 'error': 'pdf_path is required'}), 400

        if not os.path.exists(pdf_path):
            return jsonify({
                'success': False,
                'error': f'PDF file not found: {pdf_path}'
            }), 404

        logger.info("=" * 60)
        logger.info(f"处理 PDF: {pdf_path}")
        logger.info(f"输出目录: {output_dir}")
        logger.info(f"Verbose模式: {'启用' if verbose else '禁用'}")
        logger.info("=" * 60)

        # Make sure the output directory exists.
        os.makedirs(output_dir, exist_ok=True)

        # The processing logic lives in test_accuracy_batch_full.py; it is
        # imported per request so an import failure is reported to the caller.
        try:
            from test_accuracy_batch_full import process_single_pdf_standalone
        except ImportError as e:
            logger.error(f"无法导入 test_accuracy_batch_full: {e}")
            return jsonify({
                'success': False,
                'error': f'Cannot import test_accuracy_batch_full: {e}'
            }), 500

        # Run the PDF through the pipeline, forwarding the verbose flag.
        try:
            result = process_single_pdf_standalone(
                Path(pdf_path),
                Path(output_dir),
                ocr_model='paddleocr_vl',
                vl_pipeline=vl_pipeline,
                verbose=verbose
            )

            # Everything below reads the result as a dict; fail with a clear
            # message rather than an AttributeError from a later .get().
            if not isinstance(result, dict):
                raise TypeError(
                    f"unexpected result type from process_single_pdf_standalone: "
                    f"{type(result).__name__}"
                )

            # Normalize response fields for Java client compatibility.
            _normalize_pdf_result(result)

            # Summary logging uses the same defensive type checks as the
            # normalization above, so an odd-shaped but successful result is
            # not turned into a 500 by a log statement.
            logger.info("✅ 处理成功")
            cma = result.get('cma')
            if cma and isinstance(cma, dict):
                logger.info(f" CMA: {cma.get('code', 'N/A')}")
            institutions = result.get('institutions') or []
            if not isinstance(institutions, list):
                institutions = []
            logger.info(f" 机构数: {len(institutions)}")
            if institutions:
                logger.info(f" 机构: {institutions[0]}")
            logger.info("=" * 60)

            # Return the full result (includes verbose details when enabled).
            return jsonify(result)

        except Exception as e:
            tb = traceback.format_exc()
            logger.error(f"PDF 处理失败: {e}\n{tb}")
            # NOTE(review): the traceback is returned to the caller; confirm
            # this endpoint is only reachable by trusted clients.
            return jsonify({
                'success': False,
                'error': f'PDF processing failed: {str(e)}',
                'traceback': tb
            }), 500

    except Exception as e:
        tb = traceback.format_exc()
        logger.error(f"❌ 请求处理失败: {e}\n{tb}")
        return jsonify({
            'success': False,
            'error': str(e),
            'traceback': tb
        }), 500
|
||
|
||
|
||
@app.route('/api/ocr/image', methods=['POST'])
def ocr_image():
    """
    Handle an OCR request for a single image (used for seal recognition).

    Request JSON:
    {
        "image_path": "/path/to/image.png"
    }

    Response JSON:
    {
        "success": true,
        "text": "recognized text content"
    }

    Status codes: 400 for a missing/invalid JSON body or missing
    ``image_path``, 404 when the image does not exist, 500 when the VL
    pipeline is not initialized or recognition raises.
    """
    try:
        # silent=True returns None for a malformed body or non-JSON content
        # type; together with the dict check this yields a 400, matching the
        # PDF endpoint, instead of an AttributeError reported as a 500.
        data = request.get_json(silent=True)
        if not data or not isinstance(data, dict):
            return jsonify({'success': False, 'error': 'Invalid JSON'}), 400

        image_path = data.get('image_path')

        if not image_path:
            return jsonify({'success': False, 'error': 'image_path is required'}), 400

        if not os.path.exists(image_path):
            return jsonify({'success': False, 'error': f'Image not found: {image_path}'}), 404

        logger.info(f"处理图像: {image_path}")

        # Recognition goes through the PaddleOCRVL pipeline only.
        if not vl_pipeline:
            return jsonify({'success': False, 'error': 'PaddleOCRVL not initialized'}), 500

        # NOTE(review): neither the availability of an `.ocr()` method on
        # PaddleOCRVL nor the result layout assumed below (result[0] iterable
        # of lines, text taken from line[0][0]) can be verified from this
        # file - confirm against the installed PaddleOCR version.
        result = vl_pipeline.ocr(image_path)

        # Collect one text fragment per non-empty line of the first result.
        texts = []
        if result and len(result) > 0:
            for line in result[0]:
                if line and len(line) > 0:
                    texts.append(line[0][0])

        text = ' '.join(texts)
        logger.info(f"识别文本: {text}")

        return jsonify({
            'success': True,
            'text': text
        })

    except Exception as e:
        tb = traceback.format_exc()
        logger.error(f"❌ 图像识别失败: {e}\n{tb}")
        return jsonify({'success': False, 'error': str(e), 'traceback': tb}), 500
|
||
|
||
|
||
@app.errorhandler(404)
def not_found(error):
    """Return a JSON error body for 404 responses."""
    body = jsonify({'success': False, 'error': 'Endpoint not found'})
    return body, 404
|
||
|
||
|
||
@app.errorhandler(500)
def internal_error(error):
    """Return a JSON error body for 500 responses."""
    body = jsonify({'success': False, 'error': 'Internal server error'})
    return body, 500
|
||
|
||
|
||
if __name__ == '__main__':
    # Load both pipelines before the server starts accepting requests.
    init_models()

    # Bind address and port come from the environment, with defaults.
    host = os.environ.get('HOST', '0.0.0.0')
    port = int(os.environ.get('PORT', 8081))

    separator = "=" * 60
    logger.info(separator)
    logger.info("Flask OCR API 服务器启动")
    logger.info(f"地址: http://{host}:{port}")
    logger.info(separator)

    # Start the Flask server with threaded request handling.
    app.run(host=host, port=port, threaded=True)
|