# File: report-detect/archive/crt_tests/diagnose_crt_extraction.py
# (308 lines, 13 KiB, Python)
#
# Repository-viewer notice: this file contains Unicode characters that might
# be confused with other characters. If that is intentional, the warning can
# safely be ignored.

"""
Diagnose CRT extraction problems - check the digital-signature status of
YDQ25_002294.pdf and YDQ23_001838.pdf.
"""
import sys
import pikepdf
from pathlib import Path
def check_pdf_signature(pdf_path):
    """Report whether a PDF carries AcroForm digital-signature fields.

    The PDF is opened with pikepdf and the top-level entries of
    ``/AcroForm /Fields`` are scanned for fields whose ``/FT`` is ``/Sig``.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        dict: {
            'pdf_name': str, file-name component of ``pdf_path``,
            'has_signature': bool, True when at least one /Sig field exists,
            'num_signatures': int, number of /Sig fields found,
            'signature_info': list of per-field detail dicts,
            'is_encrypted': bool, value of ``pdf.is_encrypted``,
            'is_locked': bool, True when pikepdf raised PasswordError,
            'error': str or None,
            'permissions': value of ``pdf.allow`` (only when readable),
            'permissions_error': str (only when reading ``pdf.allow`` failed)
        }

    Failures to open or read the file are reported through ``'error'``
    rather than raised.
    """
    result = {
        'pdf_name': Path(pdf_path).name,
        'has_signature': False,
        'num_signatures': 0,
        'signature_info': [],
        'is_encrypted': False,
        'is_locked': False,
        'error': None
    }
    try:
        # Try to open the PDF.
        with pikepdf.open(pdf_path) as pdf:
            # Record whether the file is encrypted.
            result['is_encrypted'] = pdf.is_encrypted

            # Digital signatures normally live in the AcroForm field list.
            if '/AcroForm' in pdf.Root:
                acroform = pdf.Root.AcroForm
                if '/Fields' in acroform:
                    sig_fields = [
                        field for field in acroform.Fields
                        if '/FT' in field and field.FT == '/Sig'
                    ]
                    result['num_signatures'] = len(sig_fields)
                    result['has_signature'] = len(sig_fields) > 0
                    for i, sig_field in enumerate(sig_fields):
                        result['signature_info'].append(
                            _describe_signature_field(i, sig_field)
                        )

            # Document permissions are best-effort: a failure here must not
            # discard the signature findings, but it is recorded instead of
            # being silently swallowed.
            try:
                result['permissions'] = pdf.allow
            except Exception as e:
                result['permissions_error'] = str(e)
    except pikepdf.PasswordError:
        result['error'] = "PDF is password-protected"
        result['is_locked'] = True
    except Exception as e:
        result['error'] = f"Failed to open PDF: {str(e)}"
    return result


def _describe_signature_field(index, sig_field):
    """Collect diagnostic details about one /Sig form field into a dict."""
    info = {
        'index': index,
        'has_value': '/V' in sig_field,
    }
    if '/V' not in sig_field:
        return info

    # Common institution names searched for in the raw signature bytes.
    institutions = [
        "广东产品质量监督检验研究院",
        "广东产品质量监督检验",
        "广东省产品质量监督检验研究院",
        "质量监督检验"
    ]
    # Try to read the signature value.
    try:
        sig_value = sig_field.V
        info['has_content'] = True
        # All keys present in the signature dictionary.
        info['keys'] = list(sig_value.keys())
        # Signer / institution name, when the signature declares one.
        if '/Name' in sig_value:
            info['signer_name'] = str(sig_value.Name)
        # Certificate information carried by the signature.
        if '/Contents' in sig_value:
            info['has_certificate_data'] = True
            try:
                contents = sig_value.Contents
                # pikepdf hands back string objects as pikepdf.String, not
                # as Python bytes, so convert explicitly; otherwise the
                # scan below would never run.
                if not isinstance(contents, bytes):
                    contents = bytes(contents)
                # Size of the raw /Contents data (PKCS#7 signature blob).
                info['certificate_size'] = len(contents)
                # Look for a UTF-8 encoded institution name in the data.
                for inst in institutions:
                    if inst.encode('utf-8') in contents:
                        info['institution_in_cert'] = inst
                        break
            except Exception as e:
                info['cert_decode_error'] = str(e)
        # Other optional descriptive entries.
        if '/Reason' in sig_value:
            info['reason'] = str(sig_value.Reason)
        if '/Location' in sig_value:
            info['location'] = str(sig_value.Location)
        if '/M' in sig_value:
            info['modification_date'] = str(sig_value.M)
    except Exception as e:
        info['error'] = str(e)
    return info
def extract_crt_from_pdf(pdf_path):
    """Try to extract the CRT institution name from a PDF.

    Sources are tried in this order and the first hit is returned:

    1. For each signed ``/Sig`` field in ``/AcroForm /Fields``:
       the signature's ``/Name``; a known institution name found
       (UTF-8 encoded) inside the raw ``/Contents`` bytes; ``/Reason``;
       ``/Location`` (the last two only when longer than 3 characters).
    2. ``/Author`` and then ``/Subject`` of the document information
       dictionary.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        dict with keys 'pdf_name', 'success' (bool), 'institution'
        (str or None), 'method' (str or None, names the source that
        matched) and 'error' (str or None). 'cert_error' is added when
        reading a signature's /Contents failed. Failures are reported via
        'error' rather than raised.
    """
    result = {
        'pdf_name': Path(pdf_path).name,
        'success': False,
        'institution': None,
        'method': None,
        'error': None
    }
    # Common institution names searched for in the raw signature bytes.
    institutions = [
        "广东产品质量监督检验研究院",
        "广东产品质量监督检验",
        "广东省产品质量监督检验研究院",
        "质量监督检验研究院",
        "产品质量监督检验"
    ]
    try:
        with pikepdf.open(pdf_path) as pdf:
            # Method 1: AcroForm signature fields.
            fields = []
            if '/AcroForm' in pdf.Root:
                acroform = pdf.Root.AcroForm
                if '/Fields' in acroform:
                    fields = acroform.Fields
            for field in fields:
                if not ('/FT' in field and field.FT == '/Sig' and '/V' in field):
                    continue
                sig_value = field.V

                # Attempt 1: read the /Name entry directly.
                if '/Name' in sig_value:
                    result['success'] = True
                    result['institution'] = str(sig_value.Name)
                    result['method'] = 'acroform_signature_name'
                    return result

                # Attempt 2: search the certificate data (/Contents).
                if '/Contents' in sig_value:
                    try:
                        contents = sig_value.Contents
                        # pikepdf hands back string objects as
                        # pikepdf.String, not as Python bytes, so convert
                        # explicitly; otherwise this search never runs.
                        if not isinstance(contents, bytes):
                            contents = bytes(contents)
                        for inst in institutions:
                            if inst.encode('utf-8') in contents:
                                result['success'] = True
                                result['institution'] = inst
                                result['method'] = 'acroform_certificate_data'
                                return result
                    except Exception as e:
                        result['cert_error'] = str(e)

                # Attempt 3: fall back to /Reason, then /Location.
                if '/Reason' in sig_value:
                    reason = str(sig_value.Reason)
                    if reason and len(reason) > 3:
                        result['success'] = True
                        result['institution'] = reason
                        result['method'] = 'acroform_signature_reason'
                        return result
                if '/Location' in sig_value:
                    location = str(sig_value.Location)
                    if location and len(location) > 3:
                        result['success'] = True
                        result['institution'] = location
                        result['method'] = 'acroform_signature_location'
                        return result

            # Method 2: XMP metadata (pdf.Root.Metadata) is not parsed yet;
            # metadata parsing logic can be added here.

            # Method 3: document information dictionary. /Info is referenced
            # from the file trailer, not from the catalog, so the trailer is
            # checked first; the catalog lookup is kept only as a fallback.
            for holder in (pdf.trailer, pdf.Root):
                if '/Info' not in holder:
                    continue
                info = holder.Info
                if '/Author' in info:
                    result['success'] = True
                    result['institution'] = str(info.Author)
                    result['method'] = 'document_info_author'
                    return result
                if '/Subject' in info:
                    result['success'] = True
                    result['institution'] = str(info.Subject)
                    result['method'] = 'document_info_subject'
                    return result

            result['error'] = "No signature or institution name found in PDF"
    except Exception as e:
        result['error'] = f"Extraction failed: {str(e)}"
    return result
def main(pdf_paths=None):
    """Print a signature and CRT-extraction diagnostic report for each PDF.

    For every PDF the report has three parts: the signature status from
    ``check_pdf_signature``, the extraction attempt from
    ``extract_crt_from_pdf``, and a short summary.

    Args:
        pdf_paths: Optional iterable of PDF paths to diagnose. When omitted
            (``None``) the two default test PDFs are used.
    """
    if pdf_paths is None:
        pdf_paths = [
            "src/test/resources/data/pdfs/YDQ25_002294.pdf",
            "src/test/resources/data/pdfs/YDQ23_001838.pdf"
        ]
    # Only the first few signatures are printed in detail to keep output short.
    max_detailed = 3

    print("="*80)
    print("CRT EXTRACTION DIAGNOSTIC REPORT")
    print("="*80)
    for pdf_path in pdf_paths:
        print(f"\n{'#'*80}")
        print(f"PDF: {Path(pdf_path).name}")
        print(f"{'#'*80}\n")

        # Signature status.
        print("1. SIGNATURE STATUS CHECK")
        print("-" * 80)
        sig_check = check_pdf_signature(pdf_path)
        print(f"Has digital signature: {sig_check['has_signature']}")
        print(f"Number of signatures: {sig_check['num_signatures']}")
        print(f"Is encrypted: {sig_check['is_encrypted']}")
        print(f"Is locked: {sig_check['is_locked']}")
        if sig_check['error']:
            print(f"ERROR: {sig_check['error']}")

        signatures = sig_check['signature_info']
        if signatures:
            print("\nSignature details:")
            for info in signatures[:max_detailed]:
                print(f" Signature #{info['index']}:")
                print(f" Has value: {info.get('has_value', False)}")
                if 'keys' in info:
                    print(f" Keys in signature: {info['keys']}")
                if 'signer_name' in info:
                    print(f" Signer name: {info['signer_name']}")
                if 'institution_in_cert' in info:
                    print(f" Institution found in certificate: {info['institution_in_cert']}")
                if 'certificate_size' in info:
                    print(f" Certificate data size: {info['certificate_size']} bytes")
                if 'reason' in info:
                    print(f" Reason: {info['reason']}")
                if 'location' in info:
                    print(f" Location: {info['location']}")
                if 'error' in info:
                    print(f" Error: {info['error']}")
            # Mention the omitted signatures only when there really are some
            # (previously "and 0 more" was printed for exactly three).
            hidden = len(signatures) - max_detailed
            if hidden > 0:
                print(f" ... (and {hidden} more signatures)")

        # Extraction attempt.
        print("\n2. CRT EXTRACTION ATTEMPT")
        print("-" * 80)
        extraction_result = extract_crt_from_pdf(pdf_path)
        print(f"Success: {extraction_result['success']}")
        print(f"Method: {extraction_result['method']}")
        print(f"Institution: {extraction_result['institution']}")
        if extraction_result['error']:
            print(f"ERROR: {extraction_result['error']}")

        # Summary.
        print("\n3. SUMMARY")
        print("-" * 80)
        if sig_check['has_signature']:
            print("[OK] PDF contains digital signatures")
            if extraction_result['success']:
                print(f"[OK] CRT extraction SUCCESSFUL: {extraction_result['institution']}")
            else:
                print("[FAIL] CRT extraction FAILED despite having signatures")
        elif sig_check['error']:
            # The file could not be inspected, so the absence of signatures
            # proves nothing about the document itself.
            print("[FAIL] PDF could not be inspected for digital signatures")
            print(f" -> {sig_check['error']}")
        else:
            print("[FAIL] PDF does NOT contain digital signatures")
            print(" -> CRT extraction is not possible (likely a scanned PDF)")
            print(" -> OCR-based extraction should be used instead")
    print("\n" + "="*80)
    print("DIAGNOSTIC COMPLETE")
    print("="*80)
# Run the diagnostic report only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()