report-detect/python_api/ocr_api_server.py

395 lines
13 KiB
Python
Raw Permalink Normal View History

2026-03-16 11:57:07 +08:00
"""
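PaddleOCR REST API Server.

OCR service called by the Java backend.

Features:
- Wraps PaddleOCRVL and PP-OCRv5
- Provides a PDF processing endpoint
- Provides an image recognition endpoint
- Provides a health-check endpoint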
"""
import os
import sys
import json
import logging
import traceback
2026-03-16 11:57:07 +08:00
from pathlib import Path
from flask import Flask, request, jsonify
2026-03-19 15:02:01 +08:00
from paddleocr import PaddleOCR
try:
from paddleocr import PaddleOCRVL # type: ignore
except Exception:
PaddleOCRVL = None
2026-03-16 11:57:07 +08:00
# Logging setup: INFO level with timestamp, logger name and level in each record.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

app = Flask(__name__)

# Shared pipeline handles; both start out unset and are assigned by init_models().
vl_pipeline = None
ocr_pipeline = None

# Put the directory two levels above this file at the front of the import path.
PROJECT_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(PROJECT_ROOT))
logger.info(f"项目根目录: {PROJECT_ROOT}")

# Force local model usage to avoid user cache / auto-download.
# An explicit PADDLEOCR_HOME (or, failing that, HUB_HOME) wins; otherwise the
# "models" directory next to this file is used.
_env_model_root = os.environ.get("PADDLEOCR_HOME") or os.environ.get("HUB_HOME")
if _env_model_root:
    MODEL_ROOT = Path(_env_model_root)
else:
    MODEL_ROOT = Path(__file__).parent / "models"

# Only variables that are not already present in the environment are filled in.
os.environ.update({
    name: value
    for name, value in (
        ("PADDLEOCR_HOME", str(MODEL_ROOT)),
        ("HUB_HOME", str(MODEL_ROOT)),
        ("DISABLE_MODEL_SOURCE_CHECK", "True"),
        ("PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK", "True"),
        ("HUB_DISABLE_MODEL_SOURCE_CHECK", "True"),
        ("PADDLEHUB_NO_FETCH_LATEST", "True"),
    )
    if name not in os.environ
})
2026-03-16 11:57:07 +08:00
def _find_model_dir(base: Path, *parts: str, contains: str, requires_inference: bool = False, recursive: bool = False):
    """Find the first model directory whose name contains a fragment.

    The search starts at ``base`` joined with ``parts``. Candidates are
    visited in sorted path order so that the result is reproducible when
    several directories match (raw directory iteration order is
    filesystem-dependent).

    Args:
        base: Directory the search is anchored at.
        *parts: Optional sub-path components appended to ``base``.
        contains: Name fragment to look for; compared case-insensitively
            against each directory's own name (not its full path).
        requires_inference: When true, a directory only qualifies if it
            holds an ``inference.yml`` or ``inference.onnx`` file.
        recursive: When true, search the whole tree below the start
            directory; otherwise only its direct children.

    Returns:
        The matching directory as a ``Path``, or ``None`` when the start
        directory does not exist, nothing matches, or the filesystem could
        not be read.
    """
    root = base.joinpath(*parts)
    fragment = contains.lower()
    try:
        if not root.exists():
            return None
        entries = root.rglob("*") if recursive else root.iterdir()
        for candidate in sorted(entries):
            if not candidate.is_dir() or fragment not in candidate.name.lower():
                continue
            if requires_inference and not any(
                (candidate / marker).exists()
                for marker in ("inference.yml", "inference.onnx")
            ):
                continue
            return candidate
    except OSError as exc:
        # Lookup stays best-effort for filesystem problems, but the failure
        # is reported instead of being swallowed silently.
        logging.getLogger(__name__).warning(
            "Model directory lookup under %s failed: %s", root, exc
        )
        return None
    return None
def init_models():
    """Initialise the OCR pipelines and store them in the module globals.

    Local model directories are looked up under ``MODEL_ROOT`` first. The
    PaddleOCRVL pipeline is built only when that class could be imported;
    the PP-OCRv5 pipeline is always attempted. A failure while constructing
    either pipeline is logged and leaves the corresponding global as ``None``.

    Raises:
        RuntimeError: If the detection or recognition model directory
            cannot be found under ``MODEL_ROOT``.
    """
    global vl_pipeline, ocr_pipeline

    def _locate(fragment):
        # Recursive lookup restricted to directories holding an inference file.
        return _find_model_dir(
            MODEL_ROOT, contains=fragment, requires_inference=True, recursive=True
        )

    # Resolve local model paths (if present)
    det_dir = _locate("pp-ocrv5_server_det")
    rec_dir = _locate("pp-ocrv5_server_rec")
    cls_dir = _find_model_dir(MODEL_ROOT, "cls", contains="cls_infer")
    layout_dir = _locate("pp-doclayoutv3")
    vl_rec_dir = _locate("paddleocr-vl-1.5")

    if det_dir:
        logger.info(f"Using local det model: {det_dir}")
    if rec_dir:
        logger.info(f"Using local rec model: {rec_dir}")
    if cls_dir:
        logger.info(f"Using local cls model: {cls_dir}")
    if layout_dir:
        logger.info(f"Using local layout model: {layout_dir}")
    if vl_rec_dir:
        logger.info(f"Using local VL model: {vl_rec_dir}")

    # Detection and recognition models are mandatory.
    if not (det_dir and rec_dir):
        logger.error(f"Local OCR models not found under {MODEL_ROOT}. Offline mode requires local models.")
        raise RuntimeError("Local OCR models not found for offline mode")

    # --- PaddleOCRVL ---
    logger.info("=" * 60)
    logger.info("正在初始化 PaddleOCRVL...")
    logger.info("=" * 60)
    if PaddleOCRVL is not None:
        try:
            vl_kwargs = dict(
                use_seal_recognition=True,
                use_ocr_for_image_block=True,
                use_layout_detection=True,
                use_doc_orientation_classify=False,
                use_doc_unwarping=False,
            )
            if layout_dir:
                vl_kwargs.update(
                    layout_detection_model_dir=str(layout_dir),
                    layout_detection_model_name="PP-DocLayoutV3",
                )
            if vl_rec_dir:
                vl_kwargs.update(
                    vl_rec_model_dir=str(vl_rec_dir),
                    vl_rec_model_name="PaddleOCR-VL-1.5-0.9B",
                )
            vl_pipeline = PaddleOCRVL(**vl_kwargs)
            logger.info("✅ PaddleOCRVL 初始化成功")
        except Exception as e:
            logger.error(f"❌ PaddleOCRVL 初始化失败: {e}", exc_info=True)
            vl_pipeline = None
    else:
        logger.warning("PaddleOCRVL not available in installed paddleocr. Skipping VL pipeline.")
        vl_pipeline = None

    # --- PP-OCRv5 ---
    logger.info("=" * 60)
    logger.info("正在初始化 PP-OCRv5...")
    logger.info("=" * 60)
    try:
        # ONNX mode is switched on as soon as any resolved model directory
        # ships an inference.onnx file.
        use_onnx = any(
            (Path(model_dir) / "inference.onnx").exists()
            for model_dir in (det_dir, rec_dir, cls_dir)
            if model_dir
        )
        ocr_kwargs = dict(
            use_textline_orientation=False,
            lang="ch",
            use_doc_orientation_classify=False,
            use_doc_unwarping=False,
            use_onnx=use_onnx,
        )
        # det_dir and rec_dir are guaranteed to be set by the guard above.
        # Each path is passed under both keyword spellings.
        ocr_kwargs.update(
            det_model_dir=str(det_dir),
            text_detection_model_dir=str(det_dir),
            rec_model_dir=str(rec_dir),
            text_recognition_model_dir=str(rec_dir),
        )
        if cls_dir:
            ocr_kwargs["cls_model_dir"] = str(cls_dir)
        ocr_pipeline = PaddleOCR(**ocr_kwargs)
        logger.info("PP-OCRv5 初始化成功")
    except Exception as e:
        logger.error(f"PP-OCRv5 初始化失败: {e}", exc_info=True)
        ocr_pipeline = None

    # Final availability summary.
    logger.info("=" * 60)
    logger.info("模型初始化完成")
    logger.info(f"PaddleOCRVL: {'✅ 可用' if vl_pipeline else '❌ 不可用'}")
    logger.info(f"PP-OCRv5: {'✅ 可用' if ocr_pipeline else '❌ 不可用'}")
    logger.info("=" * 60)
@app.route('/health', methods=['GET'])
def health():
    """Health-check endpoint.

    Reports a fixed ``ok`` status, whether each pipeline global is set,
    and the project root path.
    """
    payload = {
        'status': 'ok',
        'vl_model': vl_pipeline is not None,
        'ocr_model': ocr_pipeline is not None,
        'project_root': str(PROJECT_ROOT),
    }
    return jsonify(payload)
@app.route('/api/ocr/pdf', methods=['POST'])
def ocr_pdf():
    """
    Handle an OCR request for a PDF file.

    Request JSON:
    {
        "pdf_path": "/path/to/file.pdf",
        "output_dir": "/path/to/output",
        "verbose": false  // optional, enables detailed output
    }

    Response JSON:
    {
        "success": true,
        "cma": {
            "code": "2023000001",
            "confidence": 0.95,
            "method": "template_matching"
        },
        "institutions": ["威凯检测技术有限公司"],
        "error": null,
        // only present when verbose=true
        "steps": { ... },
        "performance": { ... }
    }

    Status codes produced here: 400 for a missing/invalid JSON body or a
    missing ``pdf_path``, 404 when the PDF does not exist, 500 when the
    processing module cannot be imported or processing fails.
    """
    try:
        # silent=True makes a malformed body or wrong content type come back
        # as None, so the client gets the intended 400 instead of a 500.
        data = request.get_json(silent=True)
        if not data or not isinstance(data, dict):
            return jsonify({'success': False, 'error': 'Invalid JSON'}), 400

        pdf_path = data.get('pdf_path')
        output_dir = data.get('output_dir', 'output')
        verbose = data.get('verbose', False)

        if not pdf_path:
            return jsonify({'success': False, 'error': 'pdf_path is required'}), 400
        if not os.path.exists(pdf_path):
            return jsonify({
                'success': False,
                'error': f'PDF file not found: {pdf_path}'
            }), 404

        logger.info("=" * 60)
        logger.info(f"处理 PDF: {pdf_path}")
        logger.info(f"输出目录: {output_dir}")
        logger.info(f"Verbose模式: {'启用' if verbose else '禁用'}")
        logger.info("=" * 60)

        # Create the output directory
        os.makedirs(output_dir, exist_ok=True)

        # Import the processing logic (from test_accuracy_batch_full.py)
        try:
            from test_accuracy_batch_full import process_single_pdf_standalone
        except ImportError as e:
            logger.error(f"无法导入 test_accuracy_batch_full: {e}")
            return jsonify({
                'success': False,
                'error': f'Cannot import test_accuracy_batch_full: {e}'
            }), 500

        # Process the PDF (the verbose flag is passed through)
        try:
            ocr_model = 'paddleocr_vl' if vl_pipeline else 'ppocr_v5'
            result = process_single_pdf_standalone(
                Path(pdf_path),
                Path(output_dir),
                ocr_model=ocr_model,
                vl_pipeline=vl_pipeline,
                verbose=verbose
            )

            # Normalize response fields for Java client compatibility
            if isinstance(result, dict):
                if "cma_code" not in result:
                    cma_obj = result.get("cma") or {}
                    if isinstance(cma_obj, dict):
                        result["cma_code"] = cma_obj.get("code")
                        if "confidence" not in result and cma_obj.get("confidence") is not None:
                            result["confidence"] = cma_obj.get("confidence")
                if "institution_name" not in result:
                    insts = result.get("institutions") or []
                    if isinstance(insts, list) and len(insts) > 0:
                        result["institution_name"] = insts[0]

            logger.info("✅ 处理成功")
            # Summary logging must not turn a successful run into a 500, so it
            # tolerates a non-dict "cma" and a missing/None "institutions".
            cma_info = result.get('cma')
            if cma_info and isinstance(cma_info, dict):
                logger.info(f" CMA: {cma_info.get('code', 'N/A')}")
            institutions = result.get('institutions') or []
            logger.info(f" 机构数: {len(institutions)}")
            if isinstance(institutions, (list, tuple)) and institutions:
                logger.info(f" 机构: {institutions[0]}")
            logger.info("=" * 60)

            # Return the full result (includes verbose details when enabled)
            return jsonify(result)
        except Exception as e:
            tb = traceback.format_exc()
            logger.error(f"PDF 处理失败: {e}\n{tb}")
            # NOTE(review): the traceback is sent back to the client; kept for
            # compatibility with existing consumers — confirm this is intended.
            return jsonify({
                'success': False,
                'error': f'PDF processing failed: {str(e)}',
                'traceback': tb
            }), 500
    except Exception as e:
        tb = traceback.format_exc()
        logger.error(f"❌ 请求处理失败: {e}\n{tb}")
        return jsonify({
            'success': False,
            'error': str(e),
            'traceback': tb
        }), 500
def _extract_ocr_texts(page):
    """Collect the recognised text strings from one page of OCR output.

    NOTE(review): the layouts handled below follow PaddleOCR's documented
    result formats — confirm against the installed paddleocr version.

    Handled layouts:
      * mapping with a ``rec_texts`` list of strings;
      * sequence of lines where a line is ``(text, score)``,
        ``[(text, score), ...]`` or ``[box, (text, score)]``.

    Anything that does not match is skipped; ``None`` yields an empty list.
    """
    if page is None:
        return []
    if hasattr(page, "keys"):
        return [t for t in (page.get("rec_texts") or []) if isinstance(t, str)]

    texts = []
    for line in page:
        if not isinstance(line, (list, tuple)) or len(line) == 0:
            continue
        head = line[0]
        if isinstance(head, str):
            # (text, score)
            texts.append(head)
        elif isinstance(head, (list, tuple)) and head and isinstance(head[0], str):
            # [(text, score), ...]
            texts.append(head[0])
        elif (
            len(line) > 1
            and isinstance(line[1], (list, tuple))
            and line[1]
            and isinstance(line[1][0], str)
        ):
            # [box, (text, score)]
            texts.append(line[1][0])
    return texts


@app.route('/api/ocr/image', methods=['POST'])
def ocr_image():
    """
    Handle an OCR request for a single image (used for seal recognition).

    Request JSON:
    {
        "image_path": "/path/to/image.png"
    }

    Response JSON:
    {
        "success": true,
        "text": "recognised text content"
    }

    Status codes produced here: 400 for a missing/invalid JSON body or a
    missing ``image_path``, 404 when the image does not exist, 500 when no
    pipeline is initialised or recognition fails.
    """
    try:
        # silent=True: a malformed body yields None instead of raising, so
        # the client gets a 400 rather than a 500.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            return jsonify({'success': False, 'error': 'Invalid JSON'}), 400

        image_path = data.get('image_path')
        if not image_path:
            return jsonify({'success': False, 'error': 'image_path is required'}), 400
        if not os.path.exists(image_path):
            return jsonify({'success': False, 'error': f'Image not found: {image_path}'}), 404

        logger.info(f"处理图像: {image_path}")

        # Use PaddleOCRVL if available, otherwise fallback to PaddleOCR
        if vl_pipeline:
            # NOTE(review): assumes the VL pipeline object exposes .ocr();
            # confirm against the installed paddleocr version.
            result = vl_pipeline.ocr(image_path)
        else:
            if not ocr_pipeline:
                return jsonify({'success': False, 'error': 'OCR pipeline not initialized'}), 500
            result = ocr_pipeline.ocr(image_path)

        # Extract the text; only the first page of the result is used, and an
        # empty or None first page simply yields an empty string.
        first_page = result[0] if result else None
        text = ' '.join(_extract_ocr_texts(first_page))

        logger.info(f"识别文本: {text}")
        return jsonify({
            'success': True,
            'text': text
        })
    except Exception as e:
        tb = traceback.format_exc()
        logger.error(f"❌ 图像识别失败: {e}\n{tb}")
        return jsonify({'success': False, 'error': str(e), 'traceback': tb}), 500
2026-03-16 11:57:07 +08:00
@app.errorhandler(404)
def not_found(error):
    """Return a JSON body with status 404 for 404 errors."""
    body = {'success': False, 'error': 'Endpoint not found'}
    return jsonify(body), 404
@app.errorhandler(500)
def internal_error(error):
    """Return a JSON body with status 500 for 500 errors."""
    body = {'success': False, 'error': 'Internal server error'}
    return jsonify(body), 500
if __name__ == '__main__':
    # Load the OCR pipelines before the server starts.
    init_models()

    # Bind address and port come from HOST / PORT, with defaults.
    host = os.environ.get('HOST', '0.0.0.0')
    port = int(os.environ.get('PORT', 8081))

    logger.info("=" * 60)
    logger.info("Flask OCR API 服务器启动")
    logger.info(f"地址: http://{host}:{port}")
    logger.info("=" * 60)

    # Start the service.
    app.run(host=host, port=port, threaded=True)