""" PaddleOCR REST API Server 用于 Java 后端调用的 OCR 服务 功能: - 封装 PaddleOCRVL 和 PP-OCRv5 - 提供 PDF 处理接口 - 提供图像识别接口 - 健康检查接口 """ import os import sys import json import logging import traceback from pathlib import Path from flask import Flask, request, jsonify from paddleocr import PaddleOCR, PaddleOCRVL # 配置日志 logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' ) logger = logging.getLogger(__name__) app = Flask(__name__) # 全局模型实例 vl_pipeline = None ocr_pipeline = None # 添加项目根目录到 Python 路径 PROJECT_ROOT = Path(__file__).parent.parent sys.path.insert(0, str(PROJECT_ROOT)) logger.info(f"项目根目录: {PROJECT_ROOT}") # Force local model usage to avoid user cache / auto-download MODEL_ROOT = Path(__file__).parent / "models" def _find_model_dir(base: Path, *parts: str, contains: str, requires_inference: bool = False, recursive: bool = False): """Find the first model directory that matches a name fragment.""" try: root = base.joinpath(*parts) if not root.exists(): return None contains_lower = contains.lower() it = root.rglob("*") if recursive else root.iterdir() for p in it: if p.is_dir() and contains_lower in p.name.lower(): if requires_inference and not (p / "inference.yml").exists(): continue return p except Exception: return None return None def init_models(): """初始化 OCR 模型""" global vl_pipeline, ocr_pipeline # Resolve local model paths (if present) det_dir = _find_model_dir( MODEL_ROOT, contains="pp-ocrv5_server_det", requires_inference=True, recursive=True ) rec_dir = _find_model_dir( MODEL_ROOT, contains="pp-ocrv5_server_rec", requires_inference=True, recursive=True ) cls_dir = _find_model_dir(MODEL_ROOT, "cls", contains="cls_infer") layout_dir = _find_model_dir( MODEL_ROOT, contains="pp-doclayoutv3", requires_inference=True, recursive=True ) vl_rec_dir = _find_model_dir( MODEL_ROOT, contains="paddleocr-vl-1.5", requires_inference=True, recursive=True ) if det_dir: logger.info(f"Using local det model: {det_dir}") if rec_dir: logger.info(f"Using local rec model: {rec_dir}") if cls_dir: logger.info(f"Using local cls model: {cls_dir}") if layout_dir: logger.info(f"Using local layout model: {layout_dir}") if vl_rec_dir: logger.info(f"Using local VL model: {vl_rec_dir}") # 初始化 PaddleOCRVL logger.info("=" * 60) logger.info("正在初始化 PaddleOCRVL...") logger.info("=" * 60) try: vl_kwargs = { "use_seal_recognition": True, "use_ocr_for_image_block": True, "use_layout_detection": True, "use_doc_orientation_classify": False, "use_doc_unwarping": False, } if layout_dir: vl_kwargs["layout_detection_model_dir"] = str(layout_dir) vl_kwargs["layout_detection_model_name"] = "PP-DocLayoutV3" if vl_rec_dir: vl_kwargs["vl_rec_model_dir"] = str(vl_rec_dir) vl_kwargs["vl_rec_model_name"] = "PaddleOCR-VL-1.5-0.9B" vl_pipeline = PaddleOCRVL(**vl_kwargs) logger.info("✅ PaddleOCRVL 初始化成功") except Exception as e: logger.error(f"❌ PaddleOCRVL 初始化失败: {e}", exc_info=True) vl_pipeline = None # 初始化 PP-OCRv5 logger.info("=" * 60) logger.info("正在初始化 PP-OCRv5...") logger.info("=" * 60) try: ocr_kwargs = { "use_textline_orientation": False, "lang": "ch", "use_doc_orientation_classify": False, "use_doc_unwarping": False, } if det_dir: ocr_kwargs["text_detection_model_dir"] = str(det_dir) if rec_dir: ocr_kwargs["text_recognition_model_dir"] = str(rec_dir) ocr_pipeline = PaddleOCR(**ocr_kwargs) logger.info("PP-OCRv5 初始化成功") except Exception as e: logger.error(f"PP-OCRv5 初始化失败: {e}", exc_info=True) ocr_pipeline = None logger.info("=" * 60) logger.info("模型初始化完成") logger.info(f"PaddleOCRVL: {'✅ 可用' if vl_pipeline else '❌ 不可用'}") logger.info(f"PP-OCRv5: {'✅ 可用' if ocr_pipeline else '❌ 不可用'}") logger.info("=" * 60) @app.route('/health', methods=['GET']) def health(): """健康检查接口""" return jsonify({ 'status': 'ok', 'vl_model': vl_pipeline is not None, 'ocr_model': ocr_pipeline is not None, 'project_root': str(PROJECT_ROOT) }) @app.route('/api/ocr/pdf', methods=['POST']) def ocr_pdf(): """ 处理 PDF 文件的 OCR 请求 请求 JSON 格式: { "pdf_path": "/path/to/file.pdf", "output_dir": "/path/to/output", "verbose": false // 可选,启用详细输出 } 响应 JSON 格式: { "success": true, "cma": { "code": "2023000001", "confidence": 0.95, "method": "template_matching" }, "institutions": ["威凯检测技术有限公司"], "error": null, // 仅在 verbose=true 时包含: "steps": { ... }, "performance": { ... } } """ try: data = request.get_json() if not data: return jsonify({'success': False, 'error': 'Invalid JSON'}), 400 pdf_path = data.get('pdf_path') output_dir = data.get('output_dir', 'output') verbose = data.get('verbose', False) # 新增:verbose开关 if not pdf_path: return jsonify({'success': False, 'error': 'pdf_path is required'}), 400 if not os.path.exists(pdf_path): return jsonify({ 'success': False, 'error': f'PDF file not found: {pdf_path}' }), 404 logger.info("=" * 60) logger.info(f"处理 PDF: {pdf_path}") logger.info(f"输出目录: {output_dir}") logger.info(f"Verbose模式: {'启用' if verbose else '禁用'}") logger.info("=" * 60) # 创建输出目录 os.makedirs(output_dir, exist_ok=True) # 导入处理逻辑(从 test_accuracy_batch_full.py) try: from test_accuracy_batch_full import process_single_pdf_standalone except ImportError as e: logger.error(f"无法导入 test_accuracy_batch_full: {e}") return jsonify({ 'success': False, 'error': f'Cannot import test_accuracy_batch_full: {e}' }), 500 # 处理 PDF(传递 verbose 参数) try: result = process_single_pdf_standalone( Path(pdf_path), Path(output_dir), ocr_model='paddleocr_vl', vl_pipeline=vl_pipeline, verbose=verbose # 新增:传递verbose参数 ) # Normalize response fields for Java client compatibility if isinstance(result, dict): if "cma_code" not in result: cma_obj = result.get("cma") or {} if isinstance(cma_obj, dict): result["cma_code"] = cma_obj.get("code") if "confidence" not in result and cma_obj.get("confidence") is not None: result["confidence"] = cma_obj.get("confidence") if "institution_name" not in result: insts = result.get("institutions") or [] if isinstance(insts, list) and len(insts) > 0: result["institution_name"] = insts[0] logger.info("✅ 处理成功") if result.get('cma'): logger.info(f" CMA: {result['cma'].get('code', 'N/A')}") logger.info(f" 机构数: {len(result.get('institutions', []))}") if result.get('institutions'): logger.info(f" 机构: {result['institutions'][0]}") logger.info("=" * 60) # 返回完整结果(包含verbose信息,如果启用) return jsonify(result) except Exception as e: tb = traceback.format_exc() logger.error(f"PDF 处理失败: {e}\n{tb}") return jsonify({ 'success': False, 'error': f'PDF processing failed: {str(e)}', 'traceback': tb }), 500 except Exception as e: tb = traceback.format_exc() logger.error(f"❌ 请求处理失败: {e}\n{tb}") return jsonify({ 'success': False, 'error': str(e), 'traceback': tb }), 500 @app.route('/api/ocr/image', methods=['POST']) def ocr_image(): """ 处理单个图像的 OCR 请求(用于印章识别) 请求 JSON 格式: { "image_path": "/path/to/image.png" } 响应 JSON 格式: { "success": true, "text": "识别的文本内容" } """ try: data = request.get_json() image_path = data.get('image_path') if not image_path: return jsonify({'success': False, 'error': 'image_path is required'}), 400 if not os.path.exists(image_path): return jsonify({'success': False, 'error': f'Image not found: {image_path}'}), 404 logger.info(f"处理图像: {image_path}") # 使用 PaddleOCRVL 识别 if not vl_pipeline: return jsonify({'success': False, 'error': 'PaddleOCRVL not initialized'}), 500 result = vl_pipeline.ocr(image_path) # 提取文本 texts = [] if result and len(result) > 0: for line in result[0]: if line and len(line) > 0: texts.append(line[0][0]) text = ' '.join(texts) logger.info(f"识别文本: {text}") return jsonify({ 'success': True, 'text': text }) except Exception as e: tb = traceback.format_exc() logger.error(f"❌ 图像识别失败: {e}\n{tb}") return jsonify({'success': False, 'error': str(e), 'traceback': tb}), 500 @app.errorhandler(404) def not_found(error): """处理 404 错误""" return jsonify({ 'success': False, 'error': 'Endpoint not found' }), 404 @app.errorhandler(500) def internal_error(error): """处理 500 错误""" return jsonify({ 'success': False, 'error': 'Internal server error' }), 500 if __name__ == '__main__': # 初始化模型 init_models() # 启动服务 port = int(os.environ.get('PORT', 8081)) host = os.environ.get('HOST', '0.0.0.0') logger.info("=" * 60) logger.info("Flask OCR API 服务器启动") logger.info(f"地址: http://{host}:{port}") logger.info("=" * 60) app.run(host=host, port=port, threaded=True)