report-detect/archive/crt_tests/diagnose_crt_extraction.py

308 lines
13 KiB
Python
Raw Permalink Normal View History

chore(project): conservative cleanup - archive temp scripts and old docs Major cleanup to improve project organization and maintainability. Changes: - Moved 34 temp/debug/test scripts to archive/temp_scripts/ - Moved 9 auxiliary tools to archive/tools/ - Moved 3 CRT test scripts to archive/crt_tests/ - Moved 4 OCR test scripts to archive/ocr_tests/ - Moved 14 old documentation files to archive/docs/ - Deleted 4 useless files (duplicates, temp files) Root directory: - Before: 67 files (cluttered) - After: 10 core files (clean and organized) Core files retained: - test_accuracy_batch_full.py (main script) - cma_extraction_template_primary.py (CMA extraction) - cma_extraction_final.py (backup CMA extraction) - CLAUDE.md (project guide) - TEST_ACCURACY_BATCH_README.md (usage guide) - TEST_ACCURACY_BATCH_DEPENDENCIES.md (dependency docs) - CLEANUP_PLAN.md (cleanup plan) - CLEANUP_SUMMARY.md (this file) - IMPLEMENTATION_SUMMARY.md (implementation summary) - requirements.txt (dependencies) Archive structure: archive/ ├── temp_scripts/ (34 files: test_, debug_, analyze_, etc.) ├── tools/ (9 files: find_, show_, visualize_, etc.) ├── crt_tests/ (3 files: CRT extraction tests) ├── ocr_tests/ (4 files: OCR timeout tests) └── docs/ (14 files: old reports and guides) Benefits: ✓ Cleaner root directory - easier navigation ✓ Better organization - clear separation of concerns ✓ Preserved history - all files archived, not deleted ✓ Improved maintainability - easier to find active files ✓ Better git history - removed 198 deleted files from tracking No functional changes - all core functionality preserved. Related: - TEST_ACCURACY_BATCH_DEPENDENCIES.md - dependency analysis - CLEANUP_PLAN.md - detailed cleanup plan Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-03 14:35:06 +08:00
"""
诊断CRT提取问题 - 检查YDQ25_002294.pdf和YDQ23_001838.pdf的数字签名状态
"""
import sys
import pikepdf
from pathlib import Path
def _summarize_signature_field(index, sig_field):
    """Build the ``signature_info`` entry for one ``/Sig`` form field.

    Args:
        index: Position of the field among the signature fields found.
        sig_field: pikepdf dictionary object of the signature field.

    Returns:
        dict: Always contains 'index' and 'has_value'; further keys are added
        only for entries that are present in the signature dictionary (/V).
    """
    # Candidate institution names, most specific first (first hit is reported).
    institutions = (
        "广东产品质量监督检验研究院",
        "广东产品质量监督检验",
        "广东省产品质量监督检验研究院",
        "质量监督检验",
    )
    # Certificate subject strings may be stored as UTF8String or as
    # BMPString (UTF-16BE), so both byte encodings are searched.
    encodings = ('utf-8', 'utf-16-be')

    info = {
        'index': index,
        'has_value': '/V' in sig_field,
    }
    if not info['has_value']:
        return info

    try:
        sig_value = sig_field.V
        info['has_content'] = True
        # Record every key of the signature dictionary for inspection.
        info['keys'] = list(sig_value.keys())

        if '/Name' in sig_value:
            info['signer_name'] = str(sig_value.Name)

        if '/Contents' in sig_value:
            info['has_certificate_data'] = True
            try:
                # pikepdf hands /Contents back as a pikepdf.String, not as
                # Python bytes, so it has to be converted explicitly before
                # it can be measured or searched.
                raw = bytes(sig_value.Contents)
                info['certificate_size'] = len(raw)
                for inst in institutions:
                    if any(inst.encode(enc) in raw for enc in encodings):
                        info['institution_in_cert'] = inst
                        break
            except Exception as e:
                info['cert_decode_error'] = str(e)

        # Other optional entries of the signature dictionary.
        if '/Reason' in sig_value:
            info['reason'] = str(sig_value.Reason)
        if '/Location' in sig_value:
            info['location'] = str(sig_value.Location)
        if '/M' in sig_value:
            info['modification_date'] = str(sig_value.M)
    except Exception as e:
        # Keep the partial entry and report what went wrong for this field.
        info['error'] = str(e)
    return info


def check_pdf_signature(pdf_path):
    """
    Check whether a PDF contains digital-signature form fields.

    Only the top-level entries of ``/AcroForm /Fields`` are examined; a field
    counts as a signature when its ``/FT`` entry equals ``/Sig``.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        dict: {
            'pdf_name': str,           # file-name component of pdf_path
            'has_signature': bool,
            'num_signatures': int,
            'signature_info': list,    # one dict per signature field
            'is_encrypted': bool,
            'is_locked': bool,         # True when a password is required
            'error': str or None
        }
        Additionally 'permissions' holds ``pdf.allow`` when it could be read,
        otherwise 'permissions_error' holds the failure message.
    """
    result = {
        'pdf_name': Path(pdf_path).name,
        'has_signature': False,
        'num_signatures': 0,
        'signature_info': [],
        'is_encrypted': False,
        'is_locked': False,
        'error': None
    }
    try:
        with pikepdf.open(pdf_path) as pdf:
            result['is_encrypted'] = pdf.is_encrypted

            # Signature fields are looked up in the AcroForm field list.
            if '/AcroForm' in pdf.Root:
                acroform = pdf.Root.AcroForm
                if '/Fields' in acroform:
                    sig_fields = [
                        field for field in acroform.Fields
                        if '/FT' in field and field.FT == '/Sig'
                    ]
                    result['num_signatures'] = len(sig_fields)
                    result['has_signature'] = len(sig_fields) > 0
                    for i, sig_field in enumerate(sig_fields):
                        result['signature_info'].append(
                            _summarize_signature_field(i, sig_field)
                        )

            # Document permissions are best-effort: a failure here must not
            # discard the signature findings, but it is recorded.
            try:
                result['permissions'] = pdf.allow
            except Exception as e:
                result['permissions_error'] = str(e)
    except pikepdf.PasswordError:
        result['error'] = "PDF is password-protected"
        result['is_locked'] = True
    except Exception as e:
        result['error'] = f"Failed to open PDF: {str(e)}"
    return result
def extract_crt_from_pdf(pdf_path):
    """
    Try to extract the CRT (certifying institution) name from a PDF.

    Sources are tried in this order and the first hit is returned:
      1. Signature fields in ``/AcroForm /Fields`` (top level only), per field:
         ``/Name``, then a known institution name inside the raw ``/Contents``
         blob, then ``/Reason``, then ``/Location`` (the last two only when
         longer than 3 characters).
      2. The document information dictionary: ``/Author``, then ``/Subject``
         (blank values are ignored).

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        dict: {
            'pdf_name': str,            # file-name component of pdf_path
            'success': bool,
            'institution': str or None,
            'method': str or None,      # identifier of the source that matched
            'error': str or None
        }
        'cert_error' is added when reading a signature's /Contents failed.
    """
    result = {
        'pdf_name': Path(pdf_path).name,
        'success': False,
        'institution': None,
        'method': None,
        'error': None
    }
    # Candidate institution names, most specific first (first hit wins).
    institutions = (
        "广东产品质量监督检验研究院",
        "广东产品质量监督检验",
        "广东省产品质量监督检验研究院",
        "质量监督检验研究院",
        "产品质量监督检验",
    )
    # Certificate subject strings may be stored as UTF8String or as
    # BMPString (UTF-16BE), so both byte encodings are searched.
    encodings = ('utf-8', 'utf-16-be')

    def found(institution, method):
        # Fill in the success fields and hand back the shared result dict.
        result['success'] = True
        result['institution'] = institution
        result['method'] = method
        return result

    try:
        with pikepdf.open(pdf_path) as pdf:
            # Method 1: signature fields of the AcroForm.
            if '/AcroForm' in pdf.Root:
                acroform = pdf.Root.AcroForm
                if '/Fields' in acroform:
                    for field in acroform.Fields:
                        is_signed = (
                            '/FT' in field
                            and field.FT == '/Sig'
                            and '/V' in field
                        )
                        if not is_signed:
                            continue
                        sig_value = field.V

                        # Attempt 1: the signer name stored in /Name.
                        if '/Name' in sig_value:
                            return found(
                                str(sig_value.Name),
                                'acroform_signature_name'
                            )

                        # Attempt 2: look for a known institution name in the
                        # raw signature blob.
                        if '/Contents' in sig_value:
                            try:
                                # pikepdf returns /Contents as a
                                # pikepdf.String, not Python bytes; convert
                                # explicitly so the search actually runs.
                                raw = bytes(sig_value.Contents)
                                for inst in institutions:
                                    if any(inst.encode(enc) in raw
                                           for enc in encodings):
                                        return found(
                                            inst,
                                            'acroform_certificate_data'
                                        )
                            except Exception as e:
                                result['cert_error'] = str(e)

                        # Attempt 3: fall back to /Reason, then /Location.
                        if '/Reason' in sig_value:
                            reason = str(sig_value.Reason)
                            if reason and len(reason) > 3:
                                return found(
                                    reason, 'acroform_signature_reason'
                                )
                        if '/Location' in sig_value:
                            location = str(sig_value.Location)
                            if location and len(location) > 3:
                                return found(
                                    location, 'acroform_signature_location'
                                )

            # Method 2 (XMP /Metadata stream) is not implemented.

            # Method 3: document information dictionary. It is referenced
            # from the trailer; the catalog is still checked afterwards so
            # files that carry a non-standard /Info there keep working.
            info_fields = (
                ('/Author', 'document_info_author'),
                ('/Subject', 'document_info_subject'),
            )
            for holder in (pdf.trailer, pdf.Root):
                if '/Info' not in holder:
                    continue
                info = holder.Info
                for key, method in info_fields:
                    if key in info:
                        text = str(info[key])
                        # A blank entry is not an institution name.
                        if text.strip():
                            return found(text, method)

            result['error'] = "No signature or institution name found in PDF"
    except Exception as e:
        result['error'] = f"Extraction failed: {str(e)}"
    return result
def main(argv=None):
    """
    Print a CRT-extraction diagnostic report for one or more PDFs.

    For every PDF the report shows the signature status returned by
    ``check_pdf_signature``, the outcome of ``extract_crt_from_pdf`` and a
    short summary.

    Args:
        argv: Optional list of PDF paths. When omitted, the command-line
            arguments (``sys.argv[1:]``) are used; when those are empty too,
            the two built-in sample PDFs are diagnosed.
    """
    default_pdfs = [
        "src/test/resources/data/pdfs/YDQ25_002294.pdf",
        "src/test/resources/data/pdfs/YDQ23_001838.pdf"
    ]
    if argv is None:
        argv = sys.argv[1:]
    test_pdfs = list(argv) or default_pdfs
    # Only this many signatures are printed in detail to keep output short.
    max_detailed = 3

    print("="*80)
    print("CRT EXTRACTION DIAGNOSTIC REPORT")
    print("="*80)
    for pdf_path in test_pdfs:
        print(f"\n{'#'*80}")
        print(f"PDF: {Path(pdf_path).name}")
        print(f"{'#'*80}\n")

        # Signature status
        print("1. SIGNATURE STATUS CHECK")
        print("-" * 80)
        sig_check = check_pdf_signature(pdf_path)
        print(f"Has digital signature: {sig_check['has_signature']}")
        print(f"Number of signatures: {sig_check['num_signatures']}")
        print(f"Is encrypted: {sig_check['is_encrypted']}")
        print(f"Is locked: {sig_check['is_locked']}")
        if sig_check['error']:
            print(f"ERROR: {sig_check['error']}")

        signatures = sig_check['signature_info']
        if signatures:
            print("\nSignature details:")
            for info in signatures[:max_detailed]:
                print(f" Signature #{info['index']}:")
                print(f" Has value: {info.get('has_value', False)}")
                if 'keys' in info:
                    print(f" Keys in signature: {info['keys']}")
                if 'signer_name' in info:
                    print(f" Signer name: {info['signer_name']}")
                if 'institution_in_cert' in info:
                    print(f" Institution found in certificate: {info['institution_in_cert']}")
                if 'certificate_size' in info:
                    print(f" Certificate data size: {info['certificate_size']} bytes")
                if 'reason' in info:
                    print(f" Reason: {info['reason']}")
                if 'location' in info:
                    print(f" Location: {info['location']}")
                if 'error' in info:
                    print(f" Error: {info['error']}")
            # Mention the omitted entries only when there really are some.
            hidden = len(signatures) - max_detailed
            if hidden > 0:
                print(f" ... (and {hidden} more signatures)")

        # CRT extraction attempt
        print("\n2. CRT EXTRACTION ATTEMPT")
        print("-" * 80)
        extraction_result = extract_crt_from_pdf(pdf_path)
        print(f"Success: {extraction_result['success']}")
        print(f"Method: {extraction_result['method']}")
        print(f"Institution: {extraction_result['institution']}")
        if extraction_result['error']:
            print(f"ERROR: {extraction_result['error']}")

        # Summary
        print("\n3. SUMMARY")
        print("-" * 80)
        if sig_check['has_signature']:
            print("[OK] PDF contains digital signatures")
            if extraction_result['success']:
                print(f"[OK] CRT extraction SUCCESSFUL: {extraction_result['institution']}")
            else:
                print("[FAIL] CRT extraction FAILED despite having signatures")
        elif sig_check['error']:
            # The file could not be inspected, so nothing can be concluded
            # about whether it is signed or scanned.
            print(f"[FAIL] PDF could not be inspected: {sig_check['error']}")
        else:
            print("[FAIL] PDF does NOT contain digital signatures")
            print(" -> CRT extraction is not possible (likely a scanned PDF)")
            print(" -> OCR-based extraction should be used instead")
    print("\n" + "="*80)
    print("DIAGNOSTIC COMPLETE")
    print("="*80)


if __name__ == "__main__":
    main()