report-detect/archive/ocr_tests/investigate_seal_3.py

171 lines
4.9 KiB
Python
Raw Permalink Normal View History

chore(project): conservative cleanup - archive temp scripts and old docs Major cleanup to improve project organization and maintainability. Changes: - Moved 34 temp/debug/test scripts to archive/temp_scripts/ - Moved 9 auxiliary tools to archive/tools/ - Moved 3 CRT test scripts to archive/crt_tests/ - Moved 4 OCR test scripts to archive/ocr_tests/ - Moved 14 old documentation files to archive/docs/ - Deleted 4 useless files (duplicates, temp files) Root directory: - Before: 67 files (cluttered) - After: 10 core files (clean and organized) Core files retained: - test_accuracy_batch_full.py (main script) - cma_extraction_template_primary.py (CMA extraction) - cma_extraction_final.py (backup CMA extraction) - CLAUDE.md (project guide) - TEST_ACCURACY_BATCH_README.md (usage guide) - TEST_ACCURACY_BATCH_DEPENDENCIES.md (dependency docs) - CLEANUP_PLAN.md (cleanup plan) - CLEANUP_SUMMARY.md (this file) - IMPLEMENTATION_SUMMARY.md (implementation summary) - requirements.txt (dependencies) Archive structure: archive/ ├── temp_scripts/ (34 files: test_, debug_, analyze_, etc.) ├── tools/ (9 files: find_, show_, visualize_, etc.) ├── crt_tests/ (3 files: CRT extraction tests) ├── ocr_tests/ (4 files: OCR timeout tests) └── docs/ (14 files: old reports and guides) Benefits: ✓ Cleaner root directory - easier navigation ✓ Better organization - clear separation of concerns ✓ Preserved history - all files archived, not deleted ✓ Improved maintainability - easier to find active files ✓ Better git history - removed 198 deleted files from tracking No functional changes - all core functionality preserved. Related: - TEST_ACCURACY_BATCH_DEPENDENCIES.md - dependency analysis - CLEANUP_PLAN.md - detailed cleanup plan Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-03 14:35:06 +08:00
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Investigation script for 3.pdf seal recognition issue.
"""
import sys
from pathlib import Path
from paddleocr import PaddleOCR
def _collect_text_blocks(result):
    """Normalize a PaddleOCR result into ``(text, confidence, box)`` tuples.

    Only the first entry of ``result`` (the first image/page) is inspected.
    Two result layouts are understood:

    * dict-like entries, as produced by PaddleOCR 3.x ``predict()``, holding
      the parallel sequences ``rec_texts``, ``rec_scores`` and ``rec_polys``
      (``rec_boxes`` is used when ``rec_polys`` is missing or empty);
    * list entries of ``[box, (text, confidence)]``, as produced by the
      PaddleOCR 2.x ``ocr()`` call.

    Args:
        result: Value returned by the OCR engine; may be ``None`` or empty.

    Returns:
        list: ``(text, confidence, box)`` tuples, empty when nothing was
        recognized. ``box`` is ``None`` when no position is available and
        ``confidence`` is ``0.0`` when no score is available.
    """
    if result is None:
        return []
    pages = list(result)
    if not pages or pages[0] is None:
        return []
    page = pages[0]

    if hasattr(page, "keys"):
        # Dict-like page: iterating it directly would yield key names, not
        # text lines, so read the parallel result sequences instead.
        def _seq(key):
            value = page.get(key)
            return [] if value is None else value

        texts = _seq("rec_texts")
        scores = _seq("rec_scores")
        boxes = _seq("rec_polys")
        # len() instead of truthiness: these may be array-like objects.
        if len(boxes) == 0:
            boxes = _seq("rec_boxes")

        blocks = []
        for idx, text in enumerate(texts):
            confidence = float(scores[idx]) if idx < len(scores) else 0.0
            box = boxes[idx] if idx < len(boxes) else None
            blocks.append((str(text), confidence, box))
        return blocks

    blocks = []
    for line in page:
        if not isinstance(line, (list, tuple)) or len(line) < 2:
            continue
        box, text_info = line[0], line[1]
        # The legacy layout stores (text, confidence) as a tuple or a list.
        if isinstance(text_info, (list, tuple)):
            text = str(text_info[0]) if len(text_info) > 0 else ""
            confidence = float(text_info[1]) if len(text_info) > 1 else 0.0
        else:
            text = str(text_info)
            confidence = 0.0
        blocks.append((text, confidence, box))
    return blocks


def test_seal_recognition():
    """Run OCR on the unwarped seal image of 3.pdf and print diagnostics.

    Prints every recognized text block (text, confidence, position), the
    concatenated text, the expected institution name, and hints about
    whether the recognized text contains known substrings.

    Returns:
        bool: ``True`` when at least one text block was recognized;
        ``False`` when the image is missing or nothing was recognized.
    """
    print("=" * 80)
    print("3.pdf 印章识别调查")
    print("=" * 80)

    # Path to the unwarped seal image
    seal_path = Path("test_reports_full/3.pdf/seal_unwarp_0.png")
    if not seal_path.exists():
        print(f"错误:印章图像不存在: {seal_path}")
        return False

    print(f"\n印章图像: {seal_path}")
    print(f"文件大小: {seal_path.stat().st_size} bytes")

    # Initialize PaddleOCR
    print("\n初始化 PaddleOCR...")
    ocr = PaddleOCR(use_angle_cls=True, lang='ch')

    # Run OCR on the unwarped image
    print("\n识别解扭曲印章图像...")
    blocks = _collect_text_blocks(ocr.predict(str(seal_path)))
    if not blocks:
        print("未识别到任何文本")
        return False

    print(f"\n识别到 {len(blocks)} 个文本块:")
    for number, (text, confidence, box) in enumerate(blocks, start=1):
        print(f"\n文本块 {number}:")
        print(f" 文字: '{text}'")
        print(f" 置信度: {confidence:.4f}")
        print(f" 位置: {box}")

    combined_text = ''.join(text for text, _, _ in blocks)
    print(f"\n合并后的文字: '{combined_text}'")
    print(f"文字长度: {len(combined_text)}")

    # Show the expected institution name next to the OCR output
    expected = "深圳市中安质量检验认证有限公司"
    print(f"\n期望文字: '{expected}'")

    # Flag text that belongs to a different institution than the seal's
    if "市场监督管理局" in combined_text:
        print("\n⚠️ 发现问题:识别结果包含'市场监督管理局',但应该识别印章中的机构名称")
    # "检验认证" contains "检验", so a single membership test is sufficient
    if "检验" in combined_text:
        print("\n✓ 识别结果包含'检验'相关文字")
    return True


def test_crop_image():
    """Run OCR on the original seal crop of 3.pdf and print the results.

    Returns:
        bool: ``True`` when at least one text block was recognized;
        ``False`` when the image is missing or nothing was recognized.
    """
    print("\n" + "=" * 80)
    print("测试原始印章裁剪图像")
    print("=" * 80)

    crop_path = Path("test_reports_full/3.pdf/seal_crop_0.png")
    if not crop_path.exists():
        print(f"错误:裁剪图像不存在: {crop_path}")
        return False

    print(f"\n裁剪图像: {crop_path}")

    # Initialize PaddleOCR
    ocr = PaddleOCR(use_angle_cls=True, lang='ch')

    # Run OCR
    print("识别裁剪印章图像...")
    blocks = _collect_text_blocks(ocr.predict(str(crop_path)))
    if not blocks:
        print("未识别到任何文本")
        return False

    print(f"\n识别到 {len(blocks)} 个文本块:")
    for number, (text, confidence, _box) in enumerate(blocks, start=1):
        print(f" 文字 {number}: '{text}' (置信度: {confidence:.4f})")

    combined_text = ''.join(text for text, _, _ in blocks)
    print(f"\n合并文字: '{combined_text}'")
    return True
def check_html_report():
    """Print the institution and seal text recorded in the 3.pdf HTML report.

    Reads ``test_reports_full/3.pdf/index.html`` and prints the first match
    of the "Extracted Institution" value and of the "Recognized Text"
    paragraph, when each is present.

    Returns:
        bool: ``False`` when the report file does not exist, ``True``
        otherwise (whether or not either pattern matched).
    """
    import re

    divider = "=" * 80
    print("\n" + divider)
    print("检查HTML报告")
    print(divider)

    html_path = Path("test_reports_full/3.pdf/index.html")
    if html_path.exists():
        page = html_path.read_text(encoding='utf-8')

        # Extracted institution: value div following the section label
        institution_re = re.compile(
            r'Extracted Institution.*?<div class="value">(.*?)</div>', re.DOTALL
        )
        found = institution_re.search(page)
        if found is not None:
            extracted = found.group(1).strip()
            print(f"\n报告中的提取结果:\n '{extracted}'")

        # Seal text: remainder of the paragraph after the bold label
        seal_re = re.compile(r'Recognized Text:</strong>(.*?)</p>', re.DOTALL)
        found = seal_re.search(page)
        if found is not None:
            seal_text = found.group(1).strip()
            print(f"\n报告中的印章识别文字:\n '{seal_text}'")

        return True

    print(f"错误HTML报告不存在: {html_path}")
    return False
if __name__ == "__main__":
    print("\n开始调查3.pdf印章识别问题...\n")

    # Run the three investigation steps in order; their boolean results are
    # not used here, each step prints its own diagnostics.
    for step in (test_seal_recognition, test_crop_image, check_html_report):
        step()

    divider = "=" * 80
    print("\n" + divider)
    print("调查完成")
    print(divider)