#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Investigation script for 3.pdf seal recognition issue.
"""

import sys
from collections.abc import Mapping
from pathlib import Path

from paddleocr import PaddleOCR


def _box_as_plain(box):
    """Return *box* as nested lists when it exposes ``tolist()``, else unchanged."""
    return box.tolist() if hasattr(box, "tolist") else box


def _ocr_text_blocks(result):
    """Flatten the first page of an OCR result into ``(text, confidence, box)`` tuples.

    Two result layouts are accepted:

    * a mapping holding parallel ``rec_texts`` / ``rec_scores`` /
      ``rec_polys`` (falling back to ``dt_polys``) sequences;
    * the legacy nested list ``[[box, (text, score)], ...]``, where the
      text/score pair may be a list *or* a tuple.

    NOTE(review): the mapping keys follow the PaddleOCR 3.x ``predict()``
    documentation -- confirm against the installed PaddleOCR version.

    Args:
        result: Whatever the OCR call returned; ``None`` and empty results
            are allowed.

    Returns:
        A list of ``(text, confidence, box)`` tuples; empty when nothing was
        recognised. ``confidence`` is ``0.0`` and ``box`` is ``None`` when
        the result does not carry them.
    """
    pages = list(result) if result is not None else []
    if not pages or pages[0] is None:
        return []
    page = pages[0]

    if isinstance(page, Mapping):
        # Compare against None explicitly: these values may be arrays, whose
        # truth value is ambiguous.
        texts = page.get("rec_texts")
        if texts is None:
            texts = []
        scores = page.get("rec_scores")
        if scores is None:
            scores = []
        boxes = page.get("rec_polys")
        if boxes is None:
            boxes = page.get("dt_polys")
        if boxes is None:
            boxes = []

        blocks = []
        for idx, text in enumerate(texts):
            confidence = float(scores[idx]) if idx < len(scores) else 0.0
            box = boxes[idx] if idx < len(boxes) else None
            blocks.append((str(text), confidence, _box_as_plain(box)))
        return blocks

    blocks = []
    for entry in page:
        box, text_info = entry[0], entry[1]
        if isinstance(text_info, (list, tuple)):
            text = str(text_info[0]) if text_info else ""
            confidence = float(text_info[1]) if len(text_info) > 1 else 0.0
        else:
            # Bare value: keep its string form, no score available.
            text = str(text_info)
            confidence = 0.0
        blocks.append((text, confidence, _box_as_plain(box)))
    return blocks


def test_seal_recognition():
    """Run OCR on the unwarped seal image of 3.pdf and print every text block.

    Prints the image path and size, each recognised block (text, confidence,
    position), the concatenated text, the expected institution name for
    manual comparison, and two keyword hints.

    Returns:
        bool: ``True`` when at least one text block was recognised, ``False``
        when the image is missing or nothing was recognised.
    """
    print("=" * 80)
    print("3.pdf 印章识别调查")
    print("=" * 80)

    # Path to the unwarped seal image
    seal_path = Path("test_reports_full/3.pdf/seal_unwarp_0.png")

    if not seal_path.exists():
        print(f"错误:印章图像不存在: {seal_path}")
        return False

    print(f"\n印章图像: {seal_path}")
    print(f"文件大小: {seal_path.stat().st_size} bytes")

    print("\n初始化 PaddleOCR...")
    ocr = PaddleOCR(use_angle_cls=True, lang='ch')

    print("\n识别解扭曲印章图像...")
    blocks = _ocr_text_blocks(ocr.predict(str(seal_path)))

    if not blocks:
        print("未识别到任何文本")
        return False

    print(f"\n识别到 {len(blocks)} 个文本块:")

    for number, (text, confidence, box) in enumerate(blocks, start=1):
        print(f"\n文本块 {number}:")
        print(f" 文字: '{text}'")
        print(f" 置信度: {confidence:.4f}")
        print(f" 位置: {box}")

    combined_text = ''.join(text for text, _, _ in blocks)
    print(f"\n合并后的文字: '{combined_text}'")
    print(f"文字长度: {len(combined_text)}")

    # Printed next to the recognised text so the two can be compared by eye.
    expected = "深圳市中安质量检验认证有限公司"
    print(f"\n期望文字: '{expected}'")

    # Keyword hints about what the OCR picked up.
    if "市场监督管理局" in combined_text:
        print("\n⚠️ 发现问题:识别结果包含'市场监督管理局',但应该识别印章中的机构名称")

    # "检验" is a substring of "检验认证", so one membership test covers both.
    if "检验" in combined_text:
        print("\n✓ 识别结果包含'检验'相关文字")

    return True


def test_crop_image():
    """Run OCR on the original (not unwarped) seal crop of 3.pdf and print the text.

    Returns:
        bool: ``True`` when at least one text block was recognised, ``False``
        when the image is missing or nothing was recognised.
    """
    print("\n" + "=" * 80)
    print("测试原始印章裁剪图像")
    print("=" * 80)

    crop_path = Path("test_reports_full/3.pdf/seal_crop_0.png")

    if not crop_path.exists():
        print(f"错误:裁剪图像不存在: {crop_path}")
        return False

    print(f"\n裁剪图像: {crop_path}")

    ocr = PaddleOCR(use_angle_cls=True, lang='ch')

    print("识别裁剪印章图像...")
    blocks = _ocr_text_blocks(ocr.predict(str(crop_path)))

    if not blocks:
        print("未识别到任何文本")
        return False

    print(f"\n识别到 {len(blocks)} 个文本块:")

    for number, (text, confidence, _box) in enumerate(blocks, start=1):
        print(f" 文字 {number}: '{text}' (置信度: {confidence:.4f})")

    combined_text = ''.join(text for text, _, _ in blocks)
    print(f"\n合并文字: '{combined_text}'")

    return True


def check_html_report():
    """Print what the generated HTML report for 3.pdf recorded.

    Reads ``test_reports_full/3.pdf/index.html`` and prints the value that
    follows the "Extracted Institution" label and the text that follows
    "Recognized Text:", for whichever of the two is present.

    Returns:
        bool: ``False`` when the report file does not exist, ``True``
        otherwise (whether or not either section was found).
    """
    divider = "=" * 80
    print("\n" + divider)
    print("检查HTML报告")
    print(divider)

    html_path = Path("test_reports_full/3.pdf/index.html")
    report_found = html_path.exists()
    if not report_found:
        print(f"错误:HTML报告不存在: {html_path}")
        return False

    content = html_path.read_text(encoding='utf-8')

    import re

    # Institution value shown in the report.
    institution_hit = re.search(
        r'Extracted Institution.*?<div class="value">(.*?)</div>',
        content,
        re.DOTALL,
    )
    if institution_hit is not None:
        extracted = institution_hit.group(1).strip()
        print(f"\n报告中的提取结果:\n '{extracted}'")

    # Seal text shown in the report.
    seal_hit = re.search(
        r'Recognized Text:</strong>(.*?)</p>',
        content,
        re.DOTALL,
    )
    if seal_hit is not None:
        seal_text = seal_hit.group(1).strip()
        print(f"\n报告中的印章识别文字:\n '{seal_text}'")

    return True


if __name__ == "__main__":
    # Script entry point: announce the run, execute the three probes in a
    # fixed order (their boolean results are not used), then print a footer.
    print("\n开始调查3.pdf印章识别问题...\n")

    for probe in (test_seal_recognition, test_crop_image, check_html_report):
        probe()

    divider = "=" * 80
    print("\n" + divider)
    print("调查完成")
    print(divider)