#!/usr/bin/env python # -*- coding: utf-8 -*- """ Investigation script for 3.pdf seal recognition issue. """ import sys from pathlib import Path from paddleocr import PaddleOCR def test_seal_recognition(): """Test OCR recognition on the unwarp seal image.""" print("=" * 80) print("3.pdf 印章识别调查") print("=" * 80) # Path to the unwarp seal image seal_path = Path("test_reports_full/3.pdf/seal_unwarp_0.png") if not seal_path.exists(): print(f"错误:印章图像不存在: {seal_path}") return False print(f"\n印章图像: {seal_path}") print(f"文件大小: {seal_path.stat().st_size} bytes") # Initialize PaddleOCR print("\n初始化 PaddleOCR...") ocr = PaddleOCR(use_angle_cls=True, lang='ch') # Run OCR on unwarp image print("\n识别解扭曲印章图像...") result = ocr.predict(str(seal_path)) if result and len(result) > 0 and result[0]: print(f"\n识别到 {len(result[0])} 个文本块:") all_text = [] for i, line in enumerate(result[0]): box = line[0] text_info = line[1] # text_info might be a string or a list if isinstance(text_info, list): text = text_info[0] confidence = text_info[1] if len(text_info) > 1 else 0.0 else: text = str(text_info) confidence = 0.0 print(f"\n文本块 {i+1}:") print(f" 文字: '{text}'") print(f" 置信度: {confidence:.4f}") print(f" 位置: {box}") all_text.append(text) combined_text = ''.join(all_text) print(f"\n合并后的文字: '{combined_text}'") print(f"文字长度: {len(combined_text)}") # Compare with what's expected expected = "深圳市中安质量检验认证有限公司" print(f"\n期望文字: '{expected}'") # Check if any part matches if "市场监督管理局" in combined_text: print("\n⚠️ 发现问题:识别结果包含'市场监督管理局',但应该识别印章中的机构名称") if "检验认证" in combined_text or "检验" in combined_text: print("\n✓ 识别结果包含'检验'相关文字") return True else: print("未识别到任何文本") return False def test_crop_image(): """Test OCR on the original crop image.""" print("\n" + "=" * 80) print("测试原始印章裁剪图像") print("=" * 80) crop_path = Path("test_reports_full/3.pdf/seal_crop_0.png") if not crop_path.exists(): print(f"错误:裁剪图像不存在: {crop_path}") return False print(f"\n裁剪图像: {crop_path}") # Initialize PaddleOCR ocr = PaddleOCR(use_angle_cls=True, lang='ch') # Run OCR print("识别裁剪印章图像...") result = ocr.predict(str(crop_path)) if result and len(result) > 0 and result[0]: print(f"\n识别到 {len(result[0])} 个文本块:") all_text = [] for i, line in enumerate(result[0]): text_info = line[1] # text_info might be a string or a list if isinstance(text_info, list): text = text_info[0] confidence = text_info[1] if len(text_info) > 1 else 0.0 else: text = str(text_info) confidence = 0.0 print(f" 文字 {i+1}: '{text}' (置信度: {confidence:.4f})") all_text.append(text) combined_text = ''.join(all_text) print(f"\n合并文字: '{combined_text}'") return True else: print("未识别到任何文本") return False def check_html_report(): """Check what the HTML report says.""" print("\n" + "=" * 80) print("检查HTML报告") print("=" * 80) html_path = Path("test_reports_full/3.pdf/index.html") if not html_path.exists(): print(f"错误:HTML报告不存在: {html_path}") return False # Read and parse HTML content = html_path.read_text(encoding='utf-8') # Look for institution info import re # Find extracted institution extracted_match = re.search(r'Extracted Institution.*?