report-detect/archive/ocr_tests/investigate_seal_3.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Investigation script for 3.pdf seal recognition issue.
"""

import sys
from pathlib import Path
from paddleocr import PaddleOCR

def _extract_text_blocks(result):
    """Normalize PaddleOCR output into a list of ``(text, confidence, box)``.

    Only the first page (``result[0]``) is inspected. Two layouts are handled:

    * A dict-like page exposing parallel ``rec_texts`` / ``rec_scores`` /
      ``rec_polys`` entries. This is the layout the PaddleOCR 3.x
      documentation describes for ``PaddleOCR.predict``.
    * The legacy nested-list layout, where every entry is
      ``[box, (text, confidence)]``.

    Args:
        result: Whatever the OCR call returned; may be ``None`` or empty.

    Returns:
        A list of ``(text, confidence, box)`` tuples. ``confidence`` falls
        back to ``0.0`` and ``box`` to ``None`` when the data is missing.
        The list is empty when nothing was recognized.
    """
    if not result:
        return []
    page = result[0]
    if page is None:
        return []

    def plain(box):
        # Array-like boxes print far more readably as nested lists.
        return box.tolist() if hasattr(box, "tolist") else box

    blocks = []

    # Dict-like page. Iterating it directly would walk its *keys*, which is
    # why the fields are read by name here.
    if hasattr(page, "get"):
        texts = page.get("rec_texts")
        scores = page.get("rec_scores")
        boxes = page.get("rec_polys")
        # Explicit None checks: these values may be array-like, whose
        # truthiness is ambiguous.
        texts = [] if texts is None else texts
        scores = [] if scores is None else scores
        boxes = [] if boxes is None else boxes
        for idx, text in enumerate(texts):
            confidence = float(scores[idx]) if idx < len(scores) else 0.0
            box = plain(boxes[idx]) if idx < len(boxes) else None
            blocks.append((str(text), confidence, box))
        return blocks

    # Legacy layout: [[box, (text, confidence)], ...].
    for entry in page:
        if entry is None or len(entry) < 2:
            continue
        box, text_info = entry[0], entry[1]
        # The pair is usually a tuple, so accept both tuple and list.
        if isinstance(text_info, (list, tuple)) and len(text_info) > 0:
            text = str(text_info[0])
            confidence = float(text_info[1]) if len(text_info) > 1 else 0.0
        else:
            text = str(text_info)
            confidence = 0.0
        blocks.append((text, confidence, plain(box)))
    return blocks


def test_seal_recognition():
    """Run OCR on the unwarped seal image of 3.pdf and print a detailed report.

    Prints every recognized text block (text, confidence, position), the
    concatenated text, the expected institution name, and two diagnostic
    hints based on substrings of the concatenated text.

    Returns:
        True when at least one text block was recognized; False when the
        image file is missing or nothing was recognized.
    """
    print("=" * 80)
    print("3.pdf 印章识别调查")
    print("=" * 80)

    # Path to the unwarped seal image
    seal_path = Path("test_reports_full/3.pdf/seal_unwarp_0.png")

    if not seal_path.exists():
        print(f"错误：印章图像不存在: {seal_path}")
        return False

    print(f"\n印章图像: {seal_path}")
    print(f"文件大小: {seal_path.stat().st_size} bytes")

    # Initialize PaddleOCR
    print("\n初始化 PaddleOCR...")
    ocr = PaddleOCR(use_angle_cls=True, lang='ch')

    # Run OCR on the unwarped image
    print("\n识别解扭曲印章图像...")
    result = ocr.predict(str(seal_path))

    blocks = _extract_text_blocks(result)
    if not blocks:
        print("未识别到任何文本")
        return False

    print(f"\n识别到 {len(blocks)} 个文本块:")
    for i, (text, confidence, box) in enumerate(blocks):
        print(f"\n文本块 {i+1}:")
        print(f"  文字: '{text}'")
        print(f"  置信度: {confidence:.4f}")
        print(f"  位置: {box}")

    combined_text = ''.join(text for text, _, _ in blocks)
    print(f"\n合并后的文字: '{combined_text}'")
    print(f"文字长度: {len(combined_text)}")

    # Show the expected institution name next to the OCR output
    expected = "深圳市中安质量检验认证有限公司"
    print(f"\n期望文字: '{expected}'")

    # Flag the known misrecognition: the regulator's name instead of the
    # institution name printed on the seal
    if "市场监督管理局" in combined_text:
        print("\n⚠️ 发现问题：识别结果包含'市场监督管理局'，但应该识别印章中的机构名称")

    # "检验认证" contains "检验", so one membership test covers both cases
    if "检验" in combined_text:
        print("\n✓ 识别结果包含'检验'相关文字")

    return True


def test_crop_image():
    """Run OCR on the original (not unwarped) seal crop of 3.pdf.

    Prints each recognized text with its confidence, followed by the
    concatenated text.

    Returns:
        True when at least one text block was recognized; False when the
        image file is missing or nothing was recognized.
    """
    print("\n" + "=" * 80)
    print("测试原始印章裁剪图像")
    print("=" * 80)

    crop_path = Path("test_reports_full/3.pdf/seal_crop_0.png")

    if not crop_path.exists():
        print(f"错误：裁剪图像不存在: {crop_path}")
        return False

    print(f"\n裁剪图像: {crop_path}")

    # Initialize PaddleOCR
    ocr = PaddleOCR(use_angle_cls=True, lang='ch')

    # Run OCR
    print("识别裁剪印章图像...")
    result = ocr.predict(str(crop_path))

    blocks = _extract_text_blocks(result)
    if not blocks:
        print("未识别到任何文本")
        return False

    print(f"\n识别到 {len(blocks)} 个文本块:")
    for i, (text, confidence, _box) in enumerate(blocks):
        print(f"  文字 {i+1}: '{text}' (置信度: {confidence:.4f})")

    combined_text = ''.join(text for text, _, _ in blocks)
    print(f"\n合并文字: '{combined_text}'")

    return True


def check_html_report():
    """Print what the generated HTML report for 3.pdf recorded.

    Reads ``test_reports_full/3.pdf/index.html`` as UTF-8 and prints, when
    present, the value following "Extracted Institution" and the text
    following "Recognized Text:".

    Returns:
        False when the report file does not exist; True otherwise, even if
        neither pattern is found in the report.
    """
    import re

    divider = "=" * 80
    print("\n" + divider)
    print("检查HTML报告")
    print(divider)

    html_path = Path("test_reports_full/3.pdf/index.html")
    if not html_path.exists():
        print(f"错误：HTML报告不存在: {html_path}")
        return False

    content = html_path.read_text(encoding='utf-8')

    # DOTALL lets the lazy groups span line breaks inside the markup.
    institution_hit = re.search(
        r'Extracted Institution.*?<div class="value">(.*?)</div>',
        content,
        re.DOTALL,
    )
    seal_hit = re.search(r'Recognized Text:</strong>(.*?)</p>', content, re.DOTALL)

    # Institution name as extracted by the pipeline
    if institution_hit is not None:
        extracted = institution_hit.group(1).strip()
        print(f"\n报告中的提取结果:\n  '{extracted}'")

    # Raw text the pipeline recognized on the seal
    if seal_hit is not None:
        seal_text = seal_hit.group(1).strip()
        print(f"\n报告中的印章识别文字:\n  '{seal_text}'")

    return True


if __name__ == "__main__":
    divider = "=" * 80
    print("\n开始调查3.pdf印章识别问题...\n")

    # Run the three investigation steps in order. Each step prints its own
    # findings; the boolean results are not used here.
    for step in (test_seal_recognition, test_crop_image, check_html_report):
        step()

    print("\n" + divider)
    print("调查完成")
    print(divider)