# report-detect/python_api/ocr_api_server.py

"""
PaddleOCR REST API Server
用于 Java 后端调用的 OCR 服务

功能：
- 封装 PaddleOCRVL 和 PP-OCRv5
- 提供 PDF 处理接口
- 提供图像识别接口
- 健康检查接口
"""

import os
import sys
import json
import logging
import traceback
from pathlib import Path
from flask import Flask, request, jsonify
from paddleocr import PaddleOCR
try:
    from paddleocr import PaddleOCRVL  # type: ignore
except Exception:
    PaddleOCRVL = None

# Configure logging: INFO level, timestamp / logger name / level / message.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

app = Flask(__name__)

# Global model instances; both remain None until init_models() assigns them.
vl_pipeline = None
ocr_pipeline = None

# Put the directory above this file's directory at the front of sys.path.
# NOTE(review): presumably so that test_accuracy_batch_full (imported lazily
# inside ocr_pdf) can be resolved — confirm where that module lives.
PROJECT_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(PROJECT_ROOT))

logger.info(f"项目根目录: {PROJECT_ROOT}")

# Force local model usage to avoid user cache / auto-download
MODEL_ROOT = Path(__file__).parent / "models"


def _find_model_dir(base: Path, *parts: str, contains: str, requires_inference: bool = False, recursive: bool = False):
    """Find the first model directory whose name contains a given fragment.

    The search root is ``base`` joined with ``parts``. Candidates are visited
    in sorted path order, so when several directories match the same one is
    chosen on every run and on every filesystem.

    Args:
        base: Directory the search starts from.
        *parts: Optional sub-path components appended to ``base``.
        contains: Name fragment to look for; compared case-insensitively
            against each directory's own name (not its full path).
        requires_inference: When true, only directories that contain an
            ``inference.yml`` file are accepted.
        recursive: When true, search the whole tree below the root instead of
            only its direct children.

    Returns:
        The matching directory as a ``Path``, or ``None`` when the root does
        not exist, nothing matches, or the lookup itself fails.
    """
    try:
        root = base.joinpath(*parts)
        if not root.exists():
            return None
        needle = contains.lower()
        entries = root.rglob("*") if recursive else root.iterdir()
        # Directory listing order is filesystem-dependent; sorting makes the
        # notion of "first match" deterministic.
        for candidate in sorted(entries):
            if not candidate.is_dir() or needle not in candidate.name.lower():
                continue
            if requires_inference and not (candidate / "inference.yml").exists():
                continue
            return candidate
    except Exception:
        # Best-effort lookup: callers treat None as "no local model", so do
        # not raise — but leave a trace instead of failing silently.
        logging.getLogger(__name__).warning(
            "Model directory lookup failed (base=%r, parts=%r, contains=%r)",
            base, parts, contains, exc_info=True,
        )
        return None
    return None


def init_models():
    """Create the PaddleOCRVL and PP-OCRv5 pipelines and store them in the module globals.

    Local model directories are looked up under ``MODEL_ROOT`` first; each one
    that is found is logged and, where applicable, passed to the matching
    pipeline constructor. A pipeline that cannot be constructed is logged and
    left as ``None`` — this function does not raise for that.

    Note: the ``cls`` directory is only looked up and logged; it is not passed
    to either pipeline.
    """
    global vl_pipeline, ocr_pipeline

    separator = "=" * 60

    def locate(fragment):
        # Recursive lookup restricted to directories that ship inference.yml.
        return _find_model_dir(
            MODEL_ROOT, contains=fragment, requires_inference=True, recursive=True
        )

    # Resolve local model paths (each is None when not present).
    det_dir = locate("pp-ocrv5_server_det")
    rec_dir = locate("pp-ocrv5_server_rec")
    cls_dir = _find_model_dir(MODEL_ROOT, "cls", contains="cls_infer")
    layout_dir = locate("pp-doclayoutv3")
    vl_rec_dir = locate("paddleocr-vl-1.5")

    discovered = (
        ("det", det_dir),
        ("rec", rec_dir),
        ("cls", cls_dir),
        ("layout", layout_dir),
        ("VL", vl_rec_dir),
    )
    for label, model_dir in discovered:
        if model_dir:
            logger.info(f"Using local {label} model: {model_dir}")

    # --- PaddleOCRVL ---
    logger.info(separator)
    logger.info("正在初始化 PaddleOCRVL...")
    logger.info(separator)
    if PaddleOCRVL is not None:
        try:
            vl_kwargs = dict(
                use_seal_recognition=True,
                use_ocr_for_image_block=True,
                use_layout_detection=True,
                use_doc_orientation_classify=False,
                use_doc_unwarping=False,
            )
            if layout_dir:
                vl_kwargs.update({
                    "layout_detection_model_dir": str(layout_dir),
                    "layout_detection_model_name": "PP-DocLayoutV3",
                })
            if vl_rec_dir:
                vl_kwargs.update({
                    "vl_rec_model_dir": str(vl_rec_dir),
                    "vl_rec_model_name": "PaddleOCR-VL-1.5-0.9B",
                })

            vl_pipeline = PaddleOCRVL(**vl_kwargs)
            logger.info("✅ PaddleOCRVL 初始化成功")
        except Exception as exc:
            logger.error(f"❌ PaddleOCRVL 初始化失败: {exc}", exc_info=True)
            vl_pipeline = None
    else:
        # The optional import at the top of the file failed.
        logger.warning("PaddleOCRVL not available in installed paddleocr. Skipping VL pipeline.")
        vl_pipeline = None

    # --- PP-OCRv5 ---
    logger.info(separator)
    logger.info("正在初始化 PP-OCRv5...")
    logger.info(separator)
    try:
        ocr_kwargs = dict(
            use_textline_orientation=False,
            lang="ch",
            use_doc_orientation_classify=False,
            use_doc_unwarping=False,
        )
        if det_dir:
            ocr_kwargs["text_detection_model_dir"] = str(det_dir)
        if rec_dir:
            ocr_kwargs["text_recognition_model_dir"] = str(rec_dir)
        ocr_pipeline = PaddleOCR(**ocr_kwargs)
        logger.info("PP-OCRv5 初始化成功")
    except Exception as exc:
        logger.error(f"PP-OCRv5 初始化失败: {exc}", exc_info=True)
        ocr_pipeline = None

    # Final availability summary.
    logger.info(separator)
    logger.info("模型初始化完成")
    for name, pipeline in (("PaddleOCRVL", vl_pipeline), ("PP-OCRv5", ocr_pipeline)):
        availability = '✅ 可用' if pipeline else '❌ 不可用'
        logger.info(f"{name}: {availability}")
    logger.info(separator)


@app.route('/health', methods=['GET'])
def health():
    """Health-check endpoint.

    Reports a fixed ``status`` of ``ok``, whether each pipeline global is set,
    and the project root path as a string.
    """
    vl_ready = vl_pipeline is not None
    ocr_ready = ocr_pipeline is not None
    payload = {
        'status': 'ok',
        'vl_model': vl_ready,
        'ocr_model': ocr_ready,
        'project_root': str(PROJECT_ROOT),
    }
    return jsonify(payload)


def _normalize_pdf_result(result):
    """Add the flat fields read by the Java client to a result dict, in place.

    Existing keys are never overwritten:
    - ``cma_code`` (and ``confidence``, when present and not None) are copied
      from the nested ``cma`` dict.
    - ``institution_name`` is set to the first entry of ``institutions``.

    Args:
        result: The dict returned by the PDF processing function.

    Returns:
        The same dict, for convenience.
    """
    if "cma_code" not in result:
        cma_obj = result.get("cma") or {}
        if isinstance(cma_obj, dict):
            result["cma_code"] = cma_obj.get("code")
            if "confidence" not in result and cma_obj.get("confidence") is not None:
                result["confidence"] = cma_obj.get("confidence")
    if "institution_name" not in result:
        insts = result.get("institutions") or []
        if isinstance(insts, list) and len(insts) > 0:
            result["institution_name"] = insts[0]
    return result


@app.route('/api/ocr/pdf', methods=['POST'])
def ocr_pdf():
    """
    Handle an OCR request for a PDF file.

    Request JSON:
    {
        "pdf_path": "/path/to/file.pdf",
        "output_dir": "/path/to/output",   // optional, defaults to "output"
        "verbose": false                    // optional, enables detailed output
    }

    Response JSON (as documented for the processing function; its
    implementation lives in test_accuracy_batch_full and is not visible here):
    {
        "success": true,
        "cma": {
            "code": "2023000001",
            "confidence": 0.95,
            "method": "template_matching"
        },
        "institutions": ["威凯检测技术有限公司"],
        "error": null,
        // only when verbose=true:
        "steps": { ... },
        "performance": { ... }
    }
    For Java client compatibility the flat fields ``cma_code``, ``confidence``
    and ``institution_name`` are added when missing.

    Status codes:
        400 - body is not a non-empty JSON object, ``pdf_path`` is missing, or
              ``pdf_path`` / ``output_dir`` is not a string
        404 - ``pdf_path`` does not exist
        500 - the processing module cannot be imported or processing failed
    """
    try:
        # silent=True: a missing/incorrect content type or malformed body
        # yields None instead of raising, so it is reported as a 400 below
        # rather than being caught by the generic handler and returned as 500.
        data = request.get_json(silent=True)

        if not data or not isinstance(data, dict):
            return jsonify({'success': False, 'error': 'Invalid JSON'}), 400

        pdf_path = data.get('pdf_path')
        # `or` also covers an explicit null / empty string from the client.
        output_dir = data.get('output_dir') or 'output'
        verbose = data.get('verbose', False)

        if not pdf_path:
            return jsonify({'success': False, 'error': 'pdf_path is required'}), 400

        # os.path.exists() accepts integers (file descriptors); reject anything
        # that is not a path string before touching the filesystem.
        if not isinstance(pdf_path, str) or not isinstance(output_dir, str):
            return jsonify({
                'success': False,
                'error': 'pdf_path and output_dir must be strings'
            }), 400

        if not os.path.exists(pdf_path):
            return jsonify({
                'success': False,
                'error': f'PDF file not found: {pdf_path}'
            }), 404

        logger.info("=" * 60)
        logger.info(f"处理 PDF: {pdf_path}")
        logger.info(f"输出目录: {output_dir}")
        logger.info(f"Verbose模式: {'启用' if verbose else '禁用'}")
        logger.info("=" * 60)

        # Create the output directory.
        os.makedirs(output_dir, exist_ok=True)

        # Import the processing logic lazily (from test_accuracy_batch_full).
        try:
            from test_accuracy_batch_full import process_single_pdf_standalone
        except ImportError as e:
            logger.error(f"无法导入 test_accuracy_batch_full: {e}")
            return jsonify({
                'success': False,
                'error': f'Cannot import test_accuracy_batch_full: {e}'
            }), 500

        # Process the PDF, forwarding the verbose flag.
        try:
            ocr_model = 'paddleocr_vl' if vl_pipeline else 'ppocr_v5'
            result = process_single_pdf_standalone(
                Path(pdf_path),
                Path(output_dir),
                ocr_model=ocr_model,
                vl_pipeline=vl_pipeline,
                verbose=verbose
            )

            # Normalize response fields for Java client compatibility.
            if isinstance(result, dict):
                _normalize_pdf_result(result)

            logger.info("✅ 处理成功")
            if isinstance(result, dict):
                # The summary is informational only: guard every access so a
                # null or unexpected field cannot turn a success into a 500.
                cma = result.get('cma')
                if cma and isinstance(cma, dict):
                    logger.info(f"  CMA: {cma.get('code', 'N/A')}")
                institutions = result.get('institutions')
                if not isinstance(institutions, (list, tuple)):
                    institutions = []
                logger.info(f"  机构数: {len(institutions)}")
                if institutions:
                    logger.info(f"  机构: {institutions[0]}")
            logger.info("=" * 60)

            # Return the full result (including verbose details, if enabled).
            return jsonify(result)

        except Exception as e:
            tb = traceback.format_exc()
            logger.error(f"PDF 处理失败: {e}\n{tb}")
            # NOTE(review): the traceback is returned to the caller; kept for
            # response compatibility, but consider dropping it if this service
            # is reachable by untrusted clients.
            return jsonify({
                'success': False,
                'error': f'PDF processing failed: {str(e)}',
                'traceback': tb
            }), 500

    except Exception as e:
        tb = traceback.format_exc()
        logger.error(f"❌ 请求处理失败: {e}\n{tb}")
        return jsonify({
            'success': False,
            'error': str(e),
            'traceback': tb
        }), 500


def _legacy_line_text(line):
    """Return the text of one legacy-format OCR line, or None if there is none.

    Two list-style layouts are understood:
    - ``[box, (text, score)]``: the text is ``line[1][0]``
    - ``(text, score)``: the text is ``line[0]``
    """
    if not isinstance(line, (list, tuple)) or not line:
        return None
    if isinstance(line[0], str):
        return line[0]
    if len(line) > 1 and isinstance(line[1], (list, tuple)) and line[1]:
        candidate = line[1][0]
        if isinstance(candidate, str):
            return candidate
    return None


def _texts_from_ocr_result(result):
    """Collect recognized text strings from an OCR pipeline result.

    Supported page shapes:
    - dict-like pages with a ``rec_texts`` list
      (NOTE(review): the PaddleOCR 3.x result layout — confirm against the
      installed version);
    - dict-like pages with a ``parsing_res_list`` whose items are either dicts
      with ``block_content`` or objects with a ``content`` attribute
      (NOTE(review): assumed PaddleOCRVL layout — confirm);
    - legacy list pages of ``[box, (text, score)]`` lines.
    ``None`` pages (nothing detected) are skipped.

    Args:
        result: Whatever the pipeline call returned; may be None or empty.

    Returns:
        A list of non-empty text strings in result order.
    """
    if result is None:
        return []
    if hasattr(result, "get"):
        # A single dict-like result rather than a list of pages.
        result = [result]

    texts = []
    for page in result:
        if page is None:
            continue
        if hasattr(page, "get"):
            rec_texts = page.get("rec_texts")
            for text in (rec_texts if rec_texts is not None else ()):
                if text:
                    texts.append(str(text))
            blocks = page.get("parsing_res_list")
            for block in (blocks if blocks is not None else ()):
                if isinstance(block, dict):
                    content = block.get("block_content")
                else:
                    content = getattr(block, "content", None)
                if content:
                    texts.append(str(content))
            continue
        for line in page:
            text = _legacy_line_text(line)
            if text:
                texts.append(text)
    return texts


@app.route('/api/ocr/image', methods=['POST'])
def ocr_image():
    """
    Handle an OCR request for a single image (used for seal recognition).

    Request JSON:
    {
        "image_path": "/path/to/image.png"
    }

    Response JSON:
    {
        "success": true,
        "text": "recognized text, fragments joined by single spaces"
    }

    Status codes:
        400 - body is not a JSON object, or ``image_path`` is missing / not a string
        404 - ``image_path`` does not exist
        500 - no pipeline is initialized, or recognition failed
    """
    try:
        # silent=True: malformed or non-JSON bodies yield None instead of
        # raising, so they are answered with 400 rather than a 500 traceback.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            return jsonify({'success': False, 'error': 'Invalid JSON'}), 400

        image_path = data.get('image_path')

        if not image_path:
            return jsonify({'success': False, 'error': 'image_path is required'}), 400

        # os.path.exists() accepts integers (file descriptors); only path
        # strings are valid here.
        if not isinstance(image_path, str):
            return jsonify({'success': False, 'error': 'image_path must be a string'}), 400

        if not os.path.exists(image_path):
            return jsonify({'success': False, 'error': f'Image not found: {image_path}'}), 404

        logger.info(f"处理图像: {image_path}")
        # Use PaddleOCRVL if available, otherwise fall back to PaddleOCR.
        pipeline = vl_pipeline if vl_pipeline else ocr_pipeline
        if not pipeline:
            return jsonify({'success': False, 'error': 'OCR pipeline not initialized'}), 500

        # Prefer ocr() as before; fall back to predict() for pipelines that do
        # not expose ocr().
        # NOTE(review): PaddleOCRVL is believed to offer only predict() — confirm.
        recognize = getattr(pipeline, "ocr", None) or pipeline.predict
        result = recognize(image_path)

        # Extract the text fragments.
        texts = _texts_from_ocr_result(result)

        text = ' '.join(texts)
        logger.info(f"识别文本: {text}")

        return jsonify({
            'success': True,
            'text': text
        })

    except Exception as e:
        tb = traceback.format_exc()
        logger.error(f"❌ 图像识别失败: {e}\n{tb}")
        return jsonify({'success': False, 'error': str(e), 'traceback': tb}), 500


@app.errorhandler(404)
def not_found(error):
    """Return a JSON body for 404 errors instead of Flask's default page."""
    payload = {'success': False, 'error': 'Endpoint not found'}
    return jsonify(payload), 404


@app.errorhandler(500)
def internal_error(error):
    """Return a JSON body for 500 errors instead of Flask's default page."""
    payload = {'success': False, 'error': 'Internal server error'}
    return jsonify(payload), 500


if __name__ == '__main__':
    # Load both pipelines before the server starts accepting requests.
    init_models()

    # Bind address and port come from the environment, with defaults.
    host = os.environ.get('HOST', '0.0.0.0')
    port = int(os.environ.get('PORT', 8081))

    separator = "=" * 60
    startup_lines = (
        separator,
        "Flask OCR API 服务器启动",
        f"地址: http://{host}:{port}",
        separator,
    )
    for message in startup_lines:
        logger.info(message)

    # Start the Flask server with threaded request handling.
    app.run(host=host, port=port, threaded=True)