report-detect/archive/ocr_tests/investigate_seal_3.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Investigation script for 3.pdf seal recognition issue.
"""

import sys
from pathlib import Path
from paddleocr import PaddleOCR

def _extract_ocr_entries(result):
    """Flatten a PaddleOCR result into ``(box, text, confidence)`` tuples.

    Only ``result[0]`` (the first page/image) is inspected, which is all the
    callers in this script need. Two page layouts are understood:

    * Mapping-style page: parallel sequences stored under ``rec_texts``,
      ``rec_scores`` and ``rec_polys`` (boxes fall back to ``dt_polys``).
      NOTE(review): these key names follow the PaddleOCR 3.x ``predict()``
      documentation -- confirm against the installed version.
    * List-style page: a sequence of ``[box, text_info]`` lines where
      ``text_info`` is a ``(text, confidence)`` list/tuple or a bare value.

    Args:
        result: Whatever the OCR call returned; may be empty or ``None``.

    Returns:
        A list of ``(box, text, confidence)`` tuples; empty when nothing was
        recognized. ``box`` is ``None`` and ``confidence`` is ``0.0`` when the
        result does not carry them.
    """
    if not result:
        return []

    page = result[0]
    if page is None:
        return []

    entries = []

    # Mapping-style page. Truthiness tests are avoided on the value sequences
    # because they may be array-like objects whose bool() is ambiguous.
    if callable(getattr(page, "get", None)):
        texts = page.get("rec_texts")
        if texts is None:
            return []
        scores = page.get("rec_scores")
        boxes = page.get("rec_polys")
        if boxes is None:
            boxes = page.get("dt_polys")

        for idx, text in enumerate(texts):
            has_score = scores is not None and idx < len(scores)
            has_box = boxes is not None and idx < len(boxes)
            entries.append((
                boxes[idx] if has_box else None,
                str(text),
                float(scores[idx]) if has_score else 0.0,
            ))
        return entries

    # List-style page: one [box, text_info] item per detected line.
    for line in page:
        if len(line) < 2:
            # Malformed line without text info: nothing to report.
            continue
        box, text_info = line[0], line[1]
        # Tuples must be accepted as well as lists; otherwise the whole
        # (text, score) pair would be stringified and the score lost.
        if isinstance(text_info, (list, tuple)):
            text = text_info[0]
            confidence = text_info[1] if len(text_info) > 1 else 0.0
        else:
            text = str(text_info)
            confidence = 0.0
        entries.append((box, text, confidence))

    return entries


def test_seal_recognition():
    """Run OCR on the unwarped seal image of 3.pdf and print a detailed report.

    The image is read from ``test_reports_full/3.pdf/seal_unwarp_0.png``
    relative to the current working directory. Every recognized text block is
    printed with its confidence and position, followed by the concatenated
    text and two keyword checks used to diagnose the misrecognition.

    Returns:
        True when at least one text block was recognized; False when the
        image is missing or nothing was recognized.
    """
    print("=" * 80)
    print("3.pdf 印章识别调查")
    print("=" * 80)

    # Path to the unwarped seal image
    seal_path = Path("test_reports_full/3.pdf/seal_unwarp_0.png")

    if not seal_path.exists():
        print(f"错误：印章图像不存在: {seal_path}")
        return False

    print(f"\n印章图像: {seal_path}")
    print(f"文件大小: {seal_path.stat().st_size} bytes")

    # Initialize PaddleOCR
    print("\n初始化 PaddleOCR...")
    ocr = PaddleOCR(use_angle_cls=True, lang='ch')

    # Run OCR on the unwarped image
    print("\n识别解扭曲印章图像...")
    entries = _extract_ocr_entries(ocr.predict(str(seal_path)))

    if not entries:
        print("未识别到任何文本")
        return False

    print(f"\n识别到 {len(entries)} 个文本块:")

    for i, (box, text, confidence) in enumerate(entries, start=1):
        print(f"\n文本块 {i}:")
        print(f"  文字: '{text}'")
        print(f"  置信度: {confidence:.4f}")
        print(f"  位置: {box}")

    combined_text = ''.join(text for _, text, _ in entries)
    print(f"\n合并后的文字: '{combined_text}'")
    print(f"文字长度: {len(combined_text)}")

    # Show the institution name the seal is expected to contain
    expected = "深圳市中安质量检验认证有限公司"
    print(f"\n期望文字: '{expected}'")

    # Flag the known failure mode: text from the regulator's name instead of
    # the institution name on the seal.
    if "市场监督管理局" in combined_text:
        print("\n⚠️ 发现问题：识别结果包含'市场监督管理局'，但应该识别印章中的机构名称")

    # "检验认证" contains "检验", so a single membership test covers both.
    if "检验" in combined_text:
        print("\n✓ 识别结果包含'检验'相关文字")

    return True


def test_crop_image():
    """Run OCR on the original (not unwarped) seal crop of 3.pdf.

    The image is read from ``test_reports_full/3.pdf/seal_crop_0.png``
    relative to the current working directory. Each recognized text and its
    confidence is printed, followed by the concatenated text.

    Returns:
        True when at least one text block was recognized; False when the
        image is missing or nothing was recognized.
    """
    print("\n" + "=" * 80)
    print("测试原始印章裁剪图像")
    print("=" * 80)

    crop_path = Path("test_reports_full/3.pdf/seal_crop_0.png")

    if not crop_path.exists():
        print(f"错误：裁剪图像不存在: {crop_path}")
        return False

    print(f"\n裁剪图像: {crop_path}")

    # Initialize PaddleOCR
    ocr = PaddleOCR(use_angle_cls=True, lang='ch')

    # Run OCR
    print("识别裁剪印章图像...")
    entries = _extract_ocr_entries(ocr.predict(str(crop_path)))

    if not entries:
        print("未识别到任何文本")
        return False

    print(f"\n识别到 {len(entries)} 个文本块:")

    for i, (_, text, confidence) in enumerate(entries, start=1):
        print(f"  文字 {i}: '{text}' (置信度: {confidence:.4f})")

    combined_text = ''.join(text for _, text, _ in entries)
    print(f"\n合并文字: '{combined_text}'")

    return True


def check_html_report():
    """Print what the generated HTML report recorded for 3.pdf.

    Reads ``test_reports_full/3.pdf/index.html`` (relative to the current
    working directory) and prints the first match of the extracted
    institution and of the seal's recognized text, when present.

    Returns:
        False when the report file does not exist; True otherwise, whether
        or not either pattern matched.
    """
    divider = "=" * 80
    print("\n" + divider)
    print("检查HTML报告")
    print(divider)

    report_file = Path("test_reports_full/3.pdf/index.html")
    if not report_file.exists():
        print(f"错误：HTML报告不存在: {report_file}")
        return False

    page = report_file.read_text(encoding='utf-8')

    import re

    # DOTALL lets a captured value span several lines of markup.
    institution_hit = re.search(
        r'Extracted Institution.*?<div class="value">(.*?)</div>',
        page,
        re.DOTALL,
    )
    if institution_hit is not None:
        extracted = institution_hit.group(1).strip()
        print(f"\n报告中的提取结果:\n  '{extracted}'")

    seal_hit = re.search(
        r'Recognized Text:</strong>(.*?)</p>',
        page,
        re.DOTALL,
    )
    if seal_hit is not None:
        seal_text = seal_hit.group(1).strip()
        print(f"\n报告中的印章识别文字:\n  '{seal_text}'")

    return True


if __name__ == "__main__":
    # Script entry point: run the three investigation steps in order and
    # frame the output with a closing banner. Return values are not used.
    print("\n开始调查3.pdf印章识别问题...\n")

    for investigation_step in (
        test_seal_recognition,
        test_crop_image,
        check_html_report,
    ):
        investigation_step()

    closing_rule = "=" * 80
    print("\n" + closing_rule)
    print("调查完成")
    print(closing_rule)
-												chore(project): conservative cleanup - archive temp scripts and old docs

Major cleanup to improve project organization and maintainability.

Changes:
- Moved 34 temp/debug/test scripts to archive/temp_scripts/
- Moved 9 auxiliary tools to archive/tools/
- Moved 3 CRT test scripts to archive/crt_tests/
- Moved 4 OCR test scripts to archive/ocr_tests/
- Moved 14 old documentation files to archive/docs/
- Deleted 4 useless files (duplicates, temp files)

Root directory:
- Before: 67 files (cluttered)
- After: 10 core files (clean and organized)

Core files retained:
- test_accuracy_batch_full.py (main script)
- cma_extraction_template_primary.py (CMA extraction)
- cma_extraction_final.py (backup CMA extraction)
- CLAUDE.md (project guide)
- TEST_ACCURACY_BATCH_README.md (usage guide)
- TEST_ACCURACY_BATCH_DEPENDENCIES.md (dependency docs)
- CLEANUP_PLAN.md (cleanup plan)
- CLEANUP_SUMMARY.md (cleanup summary)
- IMPLEMENTATION_SUMMARY.md (implementation summary)
- requirements.txt (dependencies)

Archive structure:
archive/
├── temp_scripts/ (34 files: test_, debug_, analyze_, etc.)
├── tools/ (9 files: find_, show_, visualize_, etc.)
├── crt_tests/ (3 files: CRT extraction tests)
├── ocr_tests/ (4 files: OCR timeout tests)
└── docs/ (14 files: old reports and guides)

Benefits:
✓ Cleaner root directory - easier navigation
✓ Better organization - clear separation of concerns
✓ Preserved history - all files archived, not deleted
✓ Improved maintainability - easier to find active files
✓ Better git history - stopped tracking 198 files that had already been deleted

No functional changes - all core functionality preserved.

Related:
- TEST_ACCURACY_BATCH_DEPENDENCIES.md - dependency analysis
- CLEANUP_PLAN.md - detailed cleanup plan

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-03-03 14:35:06 +08:00
+								#!/usr/bin/env python
 								# -*- coding: utf-8 -*-
 								"""
 								Investigation script for 3.pdf seal recognition issue.
 								"""
 								import sys
 								from pathlib import Path
 								from paddleocr import PaddleOCR
 								def test_seal_recognition():
 								    """Test OCR recognition on the unwarp seal image."""
 								    print("=" * 80)
 								    print("3.pdf 印章识别调查")
 								    print("=" * 80)
 								    # Path to the unwarp seal image
 								    seal_path = Path("test_reports_full/3.pdf/seal_unwarp_0.png")
 								    if not seal_path.exists():
 								        print(f"错误：印章图像不存在: {seal_path}")
 								        return False
 								    print(f"\n印章图像: {seal_path}")
 								    print(f"文件大小: {seal_path.stat().st_size} bytes")
 								    # Initialize PaddleOCR
 								    print("\n初始化 PaddleOCR...")
 								    ocr = PaddleOCR(use_angle_cls=True, lang='ch')
 								    # Run OCR on unwarp image
 								    print("\n识别解扭曲印章图像...")
 								    result = ocr.predict(str(seal_path))
 								    if result and len(result) > 0 and result[0]:
 								        print(f"\n识别到 {len(result[0])} 个文本块:")
 								        all_text = []
 								        for i, line in enumerate(result[0]):
 								            box = line[0]
 								            text_info = line[1]
 								            # text_info might be a string or a list
 								            if isinstance(text_info, list):
 								                text = text_info[0]
 								                confidence = text_info[1] if len(text_info) > 1 else 0.0
 								            else:
 								                text = str(text_info)
 								                confidence = 0.0
 								            print(f"\n文本块 {i+1}:")
 								            print(f"  文字: '{text}'")
 								            print(f"  置信度: {confidence:.4f}")
 								            print(f"  位置: {box}")
 								            all_text.append(text)
 								        combined_text = ''.join(all_text)
 								        print(f"\n合并后的文字: '{combined_text}'")
 								        print(f"文字长度: {len(combined_text)}")
 								        # Compare with what's expected
 								        expected = "深圳市中安质量检验认证有限公司"
 								        print(f"\n期望文字: '{expected}'")
 								        # Check if any part matches
 								        if "市场监督管理局" in combined_text:
 								            print("\n⚠️ 发现问题：识别结果包含'市场监督管理局'，但应该识别印章中的机构名称")
 								        if "检验认证" in combined_text or "检验" in combined_text:
 								            print("\n✓ 识别结果包含'检验'相关文字")
 								        return True
 								    else:
 								        print("未识别到任何文本")
 								        return False
 								def test_crop_image():
 								    """Test OCR on the original crop image."""
 								    print("\n" + "=" * 80)
 								    print("测试原始印章裁剪图像")
 								    print("=" * 80)
 								    crop_path = Path("test_reports_full/3.pdf/seal_crop_0.png")
 								    if not crop_path.exists():
 								        print(f"错误：裁剪图像不存在: {crop_path}")
 								        return False
 								    print(f"\n裁剪图像: {crop_path}")
 								    # Initialize PaddleOCR
 								    ocr = PaddleOCR(use_angle_cls=True, lang='ch')
 								    # Run OCR
 								    print("识别裁剪印章图像...")
 								    result = ocr.predict(str(crop_path))
 								    if result and len(result) > 0 and result[0]:
 								        print(f"\n识别到 {len(result[0])} 个文本块:")
 								        all_text = []
 								        for i, line in enumerate(result[0]):
 								            text_info = line[1]
 								            # text_info might be a string or a list
 								            if isinstance(text_info, list):
 								                text = text_info[0]
 								                confidence = text_info[1] if len(text_info) > 1 else 0.0
 								            else:
 								                text = str(text_info)
 								                confidence = 0.0
 								            print(f"  文字 {i+1}: '{text}' (置信度: {confidence:.4f})")
 								            all_text.append(text)
 								        combined_text = ''.join(all_text)
 								        print(f"\n合并文字: '{combined_text}'")
 								        return True
 								    else:
 								        print("未识别到任何文本")
 								        return False
 								def check_html_report():
 								    """Check what the HTML report says."""
 								    print("\n" + "=" * 80)
 								    print("检查HTML报告")
 								    print("=" * 80)
 								    html_path = Path("test_reports_full/3.pdf/index.html")
 								    if not html_path.exists():
 								        print(f"错误：HTML报告不存在: {html_path}")
 								        return False
 								    # Read and parse HTML
 								    content = html_path.read_text(encoding='utf-8')
 								    # Look for institution info
 								    import re
 								    # Find extracted institution
 								    extracted_match = re.search(r'Extracted Institution.*?<div class="value">(.*?)</div>', content, re.DOTALL)
 								    if extracted_match:
 								        extracted = extracted_match.group(1).strip()
 								        print(f"\n报告中的提取结果:\n  '{extracted}'")
 								    # Find seal recognized text
 								    seal_match = re.search(r'Recognized Text:</strong>(.*?)</p>', content, re.DOTALL)
 								    if seal_match:
 								        seal_text = seal_match.group(1).strip()
 								        print(f"\n报告中的印章识别文字:\n  '{seal_text}'")
 								    return True
 								if __name__ == "__main__":
 								    print("\n开始调查3.pdf印章识别问题...\n")
 								    # Test all three
 								    test_seal_recognition()
 								    test_crop_image()
 								    check_html_report()
 								    print("\n" + "=" * 80)
 								    print("调查完成")
 								    print("=" * 80)