171 lines
4.9 KiB
Python
171 lines
4.9 KiB
Python
|
|
#!/usr/bin/env python
|
|||
|
|
# -*- coding: utf-8 -*-
|
|||
|
|
"""
|
|||
|
|
Investigation script for 3.pdf seal recognition issue.
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
import sys
|
|||
|
|
from pathlib import Path
|
|||
|
|
from paddleocr import PaddleOCR
|
|||
|
|
|
|||
|
|
def _parse_ocr_page(page):
    """Normalize one page of PaddleOCR output into (box, text, confidence) tuples.

    Two result layouts are understood:

    * Mapping-style pages, as produced by ``PaddleOCR.predict()`` in
      PaddleOCR 3.x, where the recognized data lives in parallel sequences
      under the ``rec_texts``, ``rec_scores`` and ``rec_polys`` keys.
    * Legacy list-style pages, where every entry is
      ``[box, (text, confidence)]``.

    Args:
        page: The first element of the OCR result; may be ``None`` or empty.

    Returns:
        A list of ``(box, text, confidence)`` tuples. The list is empty when
        nothing was recognized. ``box`` is ``None`` and ``confidence`` is
        ``0.0`` when the engine did not supply them.
    """
    if page is None:
        return []

    def _as_list(value):
        # Avoid truth-testing the value: it may be a numpy array.
        return [] if value is None else list(value)

    entries = []

    if hasattr(page, "keys"):
        # Mapping-style page: texts, scores and polygons are parallel lists.
        texts = _as_list(page.get("rec_texts"))
        scores = _as_list(page.get("rec_scores"))
        boxes = _as_list(page.get("rec_polys"))
        for idx, text in enumerate(texts):
            confidence = float(scores[idx]) if idx < len(scores) else 0.0
            box = boxes[idx] if idx < len(boxes) else None
            if hasattr(box, "tolist"):
                # Plain lists print on one line, unlike numpy arrays.
                box = box.tolist()
            entries.append((box, str(text), confidence))
        return entries

    # Legacy list-style page: [box, (text, confidence)] per entry.
    for line in page:
        box, text_info = line[0], line[1]
        # The legacy API stores (text, confidence) as a tuple, so accept both
        # tuples and lists here.
        if isinstance(text_info, (list, tuple)):
            text = str(text_info[0]) if len(text_info) > 0 else ""
            confidence = float(text_info[1]) if len(text_info) > 1 else 0.0
        else:
            text = str(text_info)
            confidence = 0.0
        entries.append((box, text, confidence))
    return entries


def test_seal_recognition():
    """Run OCR on the unwarped seal image of 3.pdf and print each text block.

    Prints the text, confidence and position of every recognized block, the
    concatenated text, the expected institution name, and two substring
    checks on the concatenated text.

    Returns:
        True when at least one text block was recognized; False when the
        image file is missing or the OCR produced no text.
    """
    print("=" * 80)
    print("3.pdf 印章识别调查")
    print("=" * 80)

    # Path to the unwarped seal image.
    seal_path = Path("test_reports_full/3.pdf/seal_unwarp_0.png")

    if not seal_path.exists():
        print(f"错误:印章图像不存在: {seal_path}")
        return False

    print(f"\n印章图像: {seal_path}")
    print(f"文件大小: {seal_path.stat().st_size} bytes")

    print("\n初始化 PaddleOCR...")
    ocr = PaddleOCR(use_angle_cls=True, lang='ch')

    print("\n识别解扭曲印章图像...")
    result = ocr.predict(str(seal_path))

    entries = _parse_ocr_page(result[0]) if result else []
    if not entries:
        print("未识别到任何文本")
        return False

    print(f"\n识别到 {len(entries)} 个文本块:")

    for i, (box, text, confidence) in enumerate(entries, start=1):
        print(f"\n文本块 {i}:")
        print(f" 文字: '{text}'")
        print(f" 置信度: {confidence:.4f}")
        print(f" 位置: {box}")

    combined_text = ''.join(text for _, text, _ in entries)
    print(f"\n合并后的文字: '{combined_text}'")
    print(f"文字长度: {len(combined_text)}")

    # Show the institution name the seal is expected to contain.
    expected = "深圳市中安质量检验认证有限公司"
    print(f"\n期望文字: '{expected}'")

    # Flag the known misrecognition (a regulator's name instead of the
    # institution printed on the seal).
    if "市场监督管理局" in combined_text:
        print("\n⚠️ 发现问题:识别结果包含'市场监督管理局',但应该识别印章中的机构名称")

    # "检验认证" contains "检验", so one substring test covers both cases.
    if "检验" in combined_text:
        print("\n✓ 识别结果包含'检验'相关文字")

    return True


def test_crop_image():
    """Run OCR on the original (not unwarped) seal crop of 3.pdf.

    Prints every recognized text with its confidence, followed by the
    concatenated text.

    Returns:
        True when at least one text block was recognized; False when the
        crop image is missing or the OCR produced no text.
    """
    print("\n" + "=" * 80)
    print("测试原始印章裁剪图像")
    print("=" * 80)

    crop_path = Path("test_reports_full/3.pdf/seal_crop_0.png")

    if not crop_path.exists():
        print(f"错误:裁剪图像不存在: {crop_path}")
        return False

    print(f"\n裁剪图像: {crop_path}")

    ocr = PaddleOCR(use_angle_cls=True, lang='ch')

    print("识别裁剪印章图像...")
    result = ocr.predict(str(crop_path))

    entries = _parse_ocr_page(result[0]) if result else []
    if not entries:
        print("未识别到任何文本")
        return False

    print(f"\n识别到 {len(entries)} 个文本块:")

    for i, (_box, text, confidence) in enumerate(entries, start=1):
        print(f" 文字 {i}: '{text}' (置信度: {confidence:.4f})")

    combined_text = ''.join(text for _, text, _ in entries)
    print(f"\n合并文字: '{combined_text}'")

    return True
|
|||
|
|
|
|||
|
|
|
|||
|
|
def check_html_report():
    """Print what the generated HTML report recorded for 3.pdf.

    Two fragments are searched for with regular expressions and printed when
    found: the value following "Extracted Institution" and the text following
    the "Recognized Text:" label.

    Returns:
        False when the report file does not exist; True otherwise, whether
        or not either fragment was found.
    """
    import re

    divider = "=" * 80
    print("\n" + divider)
    print("检查HTML报告")
    print(divider)

    html_path = Path("test_reports_full/3.pdf/index.html")
    if not html_path.exists():
        print(f"错误:HTML报告不存在: {html_path}")
        return False

    markup = html_path.read_text(encoding='utf-8')

    # Institution value extracted by the pipeline.
    institution_hit = re.search(
        r'Extracted Institution.*?<div class="value">(.*?)</div>',
        markup,
        re.DOTALL,
    )
    if institution_hit is not None:
        extracted = institution_hit.group(1).strip()
        print(f"\n报告中的提取结果:\n '{extracted}'")

    # Text the report says was recognized on the seal.
    seal_hit = re.search(
        r'Recognized Text:</strong>(.*?)</p>',
        markup,
        re.DOTALL,
    )
    if seal_hit is not None:
        seal_text = seal_hit.group(1).strip()
        print(f"\n报告中的印章识别文字:\n '{seal_text}'")

    return True
|
|||
|
|
|
|||
|
|
|
|||
|
|
if __name__ == "__main__":
    banner = "=" * 80

    print("\n开始调查3.pdf印章识别问题...\n")

    # Run the three investigation steps in order; each prints its own
    # findings and its boolean return value is not used here.
    for step in (test_seal_recognition, test_crop_image, check_html_report):
        step()

    print("\n" + banner)
    print("调查完成")
    print(banner)
|