report-detect/archive/ocr_tests/investigate_seal_3.py

171 lines
4.9 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Investigation script for 3.pdf seal recognition issue.
"""
import sys
from pathlib import Path
from paddleocr import PaddleOCR
def _extract_text_blocks(page):
    """Normalize one page of PaddleOCR output into ``(text, confidence, box)`` tuples.

    Two result shapes are accepted:

    * Mapping-style results, as documented for PaddleOCR 3.x ``predict()``:
      a dict-like object holding the parallel sequences ``rec_texts``,
      ``rec_scores`` and ``rec_polys``.
    * Legacy list-style results: a sequence of ``[box, (text, confidence)]``
      entries, where the text info may be a list, a tuple, or a bare string.

    Args:
        page: The result object for a single image/page, or ``None``.

    Returns:
        A list of ``(text, confidence, box)`` tuples. ``confidence`` is
        ``0.0`` and ``box`` is ``None`` when the result does not carry them.
        The list is empty when nothing was recognized.
    """
    if page is None:
        return []

    # Mapping-style result: iterating it directly would walk its *keys*,
    # so the recognized texts have to be read from the named fields.
    if hasattr(page, "keys"):
        texts = page.get("rec_texts")
        if texts is None:
            return []
        scores = page.get("rec_scores")
        polys = page.get("rec_polys")
        blocks = []
        for idx, text in enumerate(texts):
            # Explicit ``is not None`` / length checks: these fields may be
            # array-like objects whose truth value is ambiguous.
            has_score = scores is not None and idx < len(scores)
            has_poly = polys is not None and idx < len(polys)
            confidence = float(scores[idx]) if has_score else 0.0
            box = polys[idx] if has_poly else None
            if hasattr(box, "tolist"):
                # Array-like polygons print far more readably as plain lists.
                box = box.tolist()
            blocks.append((str(text), confidence, box))
        return blocks

    # Legacy list-style result.
    blocks = []
    for entry in page:
        if not isinstance(entry, (list, tuple)) or len(entry) < 2:
            continue
        box, text_info = entry[0], entry[1]
        if isinstance(text_info, (list, tuple)):
            text = str(text_info[0]) if text_info else ""
            confidence = float(text_info[1]) if len(text_info) > 1 else 0.0
        else:
            text = str(text_info)
            confidence = 0.0
        blocks.append((text, confidence, box))
    return blocks


def test_seal_recognition():
    """Run OCR on the unwarped seal image of 3.pdf and print a diagnosis.

    Prints every recognized text block (text, confidence, position), the
    concatenated text, and whether it contains the known-problematic
    "市场监督管理局" string or "检验"-related wording.

    Returns:
        bool: ``True`` if at least one text block was recognized, ``False``
        if the image file is missing or OCR recognized nothing.
    """
    print("=" * 80)
    print("3.pdf 印章识别调查")
    print("=" * 80)

    # Path to the unwarped seal image.
    seal_path = Path("test_reports_full/3.pdf/seal_unwarp_0.png")
    if not seal_path.exists():
        print(f"错误:印章图像不存在: {seal_path}")
        return False

    print(f"\n印章图像: {seal_path}")
    print(f"文件大小: {seal_path.stat().st_size} bytes")

    print("\n初始化 PaddleOCR...")
    ocr = PaddleOCR(use_angle_cls=True, lang='ch')

    print("\n识别解扭曲印章图像...")
    result = ocr.predict(str(seal_path))

    # Only the first page/image of the result is relevant here.
    blocks = _extract_text_blocks(result[0]) if result else []
    if not blocks:
        print("未识别到任何文本")
        return False

    print(f"\n识别到 {len(blocks)} 个文本块:")
    for i, (text, confidence, box) in enumerate(blocks, start=1):
        print(f"\n文本块 {i}:")
        print(f" 文字: '{text}'")
        print(f" 置信度: {confidence:.4f}")
        print(f" 位置: {box}")

    combined_text = ''.join(text for text, _, _ in blocks)
    print(f"\n合并后的文字: '{combined_text}'")
    print(f"文字长度: {len(combined_text)}")

    # Shown for manual comparison only; it is not matched programmatically.
    expected = "深圳市中安质量检验认证有限公司"
    print(f"\n期望文字: '{expected}'")

    if "市场监督管理局" in combined_text:
        print("\n⚠️ 发现问题:识别结果包含'市场监督管理局',但应该识别印章中的机构名称")
    # "检验认证" contains "检验", so the shorter substring check covers both.
    if "检验" in combined_text:
        print("\n✓ 识别结果包含'检验'相关文字")
    return True


def test_crop_image():
    """Run OCR on the original (not unwarped) seal crop of 3.pdf.

    Prints each recognized text with its confidence and the concatenated
    text.

    Returns:
        bool: ``True`` if at least one text block was recognized, ``False``
        if the crop image is missing or OCR recognized nothing.
    """
    print("\n" + "=" * 80)
    print("测试原始印章裁剪图像")
    print("=" * 80)

    crop_path = Path("test_reports_full/3.pdf/seal_crop_0.png")
    if not crop_path.exists():
        print(f"错误:裁剪图像不存在: {crop_path}")
        return False

    print(f"\n裁剪图像: {crop_path}")

    ocr = PaddleOCR(use_angle_cls=True, lang='ch')

    print("识别裁剪印章图像...")
    result = ocr.predict(str(crop_path))

    # Only the first page/image of the result is relevant here.
    blocks = _extract_text_blocks(result[0]) if result else []
    if not blocks:
        print("未识别到任何文本")
        return False

    print(f"\n识别到 {len(blocks)} 个文本块:")
    for i, (text, confidence, _box) in enumerate(blocks, start=1):
        print(f" 文字 {i}: '{text}' (置信度: {confidence:.4f})")

    combined_text = ''.join(text for text, _, _ in blocks)
    print(f"\n合并文字: '{combined_text}'")
    return True
def check_html_report():
    """Print the institution and seal text recorded in the 3.pdf HTML report.

    Reads ``test_reports_full/3.pdf/index.html`` and searches it with two
    regular expressions: one for the value following "Extracted Institution"
    and one for the text following "Recognized Text:". Each match that is
    found is printed, stripped of surrounding whitespace.

    Returns:
        bool: ``False`` if the report file does not exist, otherwise ``True``
        (whether or not either pattern matched).
    """
    import re

    rule = "=" * 80
    print("\n" + rule)
    print("检查HTML报告")
    print(rule)

    report_file = Path("test_reports_full/3.pdf/index.html")
    if not report_file.exists():
        print(f"错误HTML报告不存在: {report_file}")
        return False

    markup = report_file.read_text(encoding='utf-8')

    # (pattern, heading) pairs, probed in the order they are printed.
    probes = (
        (r'Extracted Institution.*?<div class="value">(.*?)</div>', "报告中的提取结果"),
        (r'Recognized Text:</strong>(.*?)</p>', "报告中的印章识别文字"),
    )
    for pattern, heading in probes:
        hit = re.search(pattern, markup, re.DOTALL)
        if hit is None:
            continue
        captured = hit.group(1).strip()
        print(f"\n{heading}:\n '{captured}'")

    return True
if __name__ == "__main__":
    print("\n开始调查3.pdf印章识别问题...\n")

    # Run the three investigation steps in order; their return values are
    # not used here.
    for step in (test_seal_recognition, test_crop_image, check_html_report):
        step()

    closing_rule = "=" * 80
    print("\n" + closing_rule)
    print("调查完成")
    print(closing_rule)