308 lines
13 KiB
Python
308 lines
13 KiB
Python
"""
Diagnose CRT extraction problems: check the digital-signature status of
YDQ25_002294.pdf and YDQ23_001838.pdf.
"""
|
||
import sys
|
||
import pikepdf
|
||
from pathlib import Path
|
||
|
||
def check_pdf_signature(pdf_path):
    """Report whether a PDF carries AcroForm digital-signature fields.

    Only the entries listed directly in ``/AcroForm /Fields`` are inspected;
    fields nested under ``/Kids`` are not followed.

    Args:
        pdf_path: Path of the PDF file to open with pikepdf.

    Returns:
        dict with the keys:
            'pdf_name': file name component of ``pdf_path``
            'has_signature': bool, at least one ``/FT /Sig`` field exists
            'num_signatures': int, number of ``/FT /Sig`` fields
            'signature_info': list of per-signature dicts (see
                ``_fill_signature_details`` for the optional keys)
            'is_encrypted': bool, as reported by pikepdf
            'is_locked': bool, True when opening raised ``PasswordError``
            'error': str or None
            'permissions': present when ``pdf.allow`` could be read
            'permissions_error': present when reading ``pdf.allow`` failed

    The function does not raise for unreadable files; failures are reported
    through the 'error' key instead.
    """
    result = {
        'pdf_name': Path(pdf_path).name,
        'has_signature': False,
        'num_signatures': 0,
        'signature_info': [],
        'is_encrypted': False,
        'is_locked': False,
        'error': None
    }

    try:
        with pikepdf.open(pdf_path) as pdf:
            result['is_encrypted'] = pdf.is_encrypted

            # Digital signatures are looked up among the AcroForm fields.
            if '/AcroForm' in pdf.Root:
                acroform = pdf.Root.AcroForm
                if '/Fields' in acroform:
                    sig_fields = [
                        field for field in acroform.Fields
                        if '/FT' in field and field.FT == '/Sig'
                    ]

                    result['num_signatures'] = len(sig_fields)
                    result['has_signature'] = bool(sig_fields)

                    for i, sig_field in enumerate(sig_fields):
                        info = {
                            'index': i,
                            'has_value': '/V' in sig_field,
                        }

                        if info['has_value']:
                            try:
                                _fill_signature_details(info, sig_field.V)
                            except Exception as e:
                                # Keep whatever was collected so far and
                                # report why the rest could not be read.
                                info['error'] = str(e)

                        result['signature_info'].append(info)

            # Document permissions are best-effort: a failure here must not
            # discard the signature information gathered above.
            try:
                result['permissions'] = pdf.allow
            except Exception as e:
                result['permissions_error'] = str(e)

    except pikepdf.PasswordError:
        result['error'] = "PDF is password-protected"
        result['is_locked'] = True
    except Exception as e:
        result['error'] = f"Failed to open PDF: {str(e)}"

    return result


def _fill_signature_details(info, sig_value):
    """Copy the interesting entries of a signature dictionary into ``info``.

    ``info`` is mutated in place so that entries collected before an
    unexpected exception survive. Keys that may be added: 'has_content',
    'keys', 'signer_name', 'has_certificate_data', 'certificate_size',
    'institution_in_cert', 'cert_decode_error', 'reason', 'location',
    'modification_date'.
    """
    info['has_content'] = True

    # Record every key of the signature dictionary for the report.
    info['keys'] = list(sig_value.keys())

    if '/Name' in sig_value:
        info['signer_name'] = str(sig_value.Name)

    if '/Contents' in sig_value:
        info['has_certificate_data'] = True
        try:
            # pikepdf hands back a pikepdf.String, not a Python ``bytes``
            # object, so it has to be converted before it can be measured
            # or searched.
            blob = bytes(sig_value.Contents)
            info['certificate_size'] = len(blob)

            # Known institution names, searched for in the raw signature
            # data.
            institutions = [
                "广东产品质量监督检验研究院",
                "广东产品质量监督检验",
                "广东省产品质量监督检验研究院",
                "质量监督检验"
            ]
            for inst in institutions:
                # Certificate names may be stored as UTF8String or as
                # BMPString (UTF-16 big-endian), so try both encodings.
                if (inst.encode('utf-8') in blob
                        or inst.encode('utf-16-be') in blob):
                    info['institution_in_cert'] = inst
                    break
        except Exception as e:
            info['cert_decode_error'] = str(e)

    # Other optional entries of the signature dictionary.
    if '/Reason' in sig_value:
        info['reason'] = str(sig_value.Reason)
    if '/Location' in sig_value:
        info['location'] = str(sig_value.Location)
    if '/M' in sig_value:
        info['modification_date'] = str(sig_value.M)
|
||
|
||
def extract_crt_from_pdf(pdf_path):
    """Try to extract the CRT institution name from a PDF.

    Sources are tried in this order and the first hit wins:

    1. For each top-level AcroForm signature field that has a value:
       ``/Name``, then a known institution name inside ``/Contents``, then
       ``/Reason``, then ``/Location`` (the last two only when longer than
       three characters).
    2. ``/Author`` and then ``/Subject`` of the document information
       dictionary.

    Args:
        pdf_path: Path of the PDF file to open with pikepdf.

    Returns:
        dict with the keys 'pdf_name', 'success' (bool), 'institution'
        (str or None), 'method' (str or None, names the source that
        matched) and 'error' (str or None). 'cert_error' is added when
        reading a signature's ``/Contents`` failed.

    The function does not raise for unreadable files; failures are reported
    through the 'error' key instead.
    """
    result = {
        'pdf_name': Path(pdf_path).name,
        'success': False,
        'institution': None,
        'method': None,
        'error': None
    }

    def _found(institution, method):
        # Fill in the success fields and hand back the result for returning.
        result['success'] = True
        result['institution'] = institution
        result['method'] = method
        return result

    try:
        with pikepdf.open(pdf_path) as pdf:
            # Method 1: AcroForm signature fields.
            for sig_value in _crt_signature_values(pdf):
                # Attempt 1: the /Name entry of the signature dictionary.
                if '/Name' in sig_value:
                    name = str(sig_value.Name)
                    # A blank name is not a usable institution.
                    if name.strip():
                        return _found(name, 'acroform_signature_name')

                # Attempt 2: look for a known institution name inside the
                # raw signature data.
                if '/Contents' in sig_value:
                    try:
                        # pikepdf returns a pikepdf.String here; convert it
                        # to bytes before searching.
                        inst = _crt_institution_in_blob(
                            bytes(sig_value.Contents))
                    except Exception as e:
                        result['cert_error'] = str(e)
                    else:
                        if inst is not None:
                            return _found(inst, 'acroform_certificate_data')

                # Attempt 3: fall back to /Reason, then /Location.
                for key, method in (
                    ('/Reason', 'acroform_signature_reason'),
                    ('/Location', 'acroform_signature_location'),
                ):
                    if key in sig_value:
                        text = str(sig_value[key])
                        if len(text) > 3:
                            return _found(text, method)

            # Method 2 (document /Metadata stream) is not implemented.

            # Method 3: document information dictionary. It is referenced
            # from the trailer; the catalog is still checked afterwards for
            # files that happen to carry an /Info entry there.
            info_dicts = []
            if '/Info' in pdf.trailer:
                info_dicts.append(pdf.trailer.Info)
            if '/Info' in pdf.Root:
                info_dicts.append(pdf.Root.Info)

            for info in info_dicts:
                for key, method in (
                    ('/Author', 'document_info_author'),
                    ('/Subject', 'document_info_subject'),
                ):
                    if key in info:
                        text = str(info[key])
                        # Skip blank entries instead of reporting success
                        # with an empty institution.
                        if text.strip():
                            return _found(text, method)

            result['error'] = "No signature or institution name found in PDF"

    except Exception as e:
        result['error'] = f"Extraction failed: {str(e)}"

    return result


def _crt_signature_values(pdf):
    """Yield the /V dictionary of every top-level signed /Sig field."""
    if '/AcroForm' not in pdf.Root:
        return
    acroform = pdf.Root.AcroForm
    if '/Fields' not in acroform:
        return
    for field in acroform.Fields:
        if '/FT' in field and field.FT == '/Sig' and '/V' in field:
            yield field.V


def _crt_institution_in_blob(blob):
    """Return the first known institution name found in ``blob``, else None."""
    # Known institution names, checked in this order.
    institutions = [
        "广东产品质量监督检验研究院",
        "广东产品质量监督检验",
        "广东省产品质量监督检验研究院",
        "质量监督检验研究院",
        "产品质量监督检验"
    ]
    for inst in institutions:
        # Certificate names may be stored as UTF8String or as BMPString
        # (UTF-16 big-endian), so try both encodings.
        if inst.encode('utf-8') in blob or inst.encode('utf-16-be') in blob:
            return inst
    return None
|
||
|
||
def main(pdf_paths=None):
    """Print a signature / CRT extraction diagnostic report to stdout.

    For every PDF the report shows the result of ``check_pdf_signature``,
    the result of ``extract_crt_from_pdf`` and a short summary.

    Args:
        pdf_paths: Optional iterable of PDF paths to diagnose. When omitted,
            the two sample PDFs under ``src/test/resources/data/pdfs`` are
            used.
    """
    if pdf_paths is None:
        pdf_paths = [
            "src/test/resources/data/pdfs/YDQ25_002294.pdf",
            "src/test/resources/data/pdfs/YDQ23_001838.pdf"
        ]

    # Only this many signatures are printed in detail, to keep the report
    # short.
    max_shown = 3

    # Optional per-signature entries, in print order: (key, line template).
    optional_lines = (
        ('keys', " Keys in signature: {}"),
        ('signer_name', " Signer name: {}"),
        ('institution_in_cert', " Institution found in certificate: {}"),
        ('certificate_size', " Certificate data size: {} bytes"),
        ('reason', " Reason: {}"),
        ('location', " Location: {}"),
        ('error', " Error: {}"),
    )

    print("="*80)
    print("CRT EXTRACTION DIAGNOSTIC REPORT")
    print("="*80)

    for pdf_path in pdf_paths:
        print(f"\n{'#'*80}")
        print(f"PDF: {Path(pdf_path).name}")
        print(f"{'#'*80}\n")

        # Signature status
        print("1. SIGNATURE STATUS CHECK")
        print("-" * 80)
        sig_check = check_pdf_signature(pdf_path)

        print(f"Has digital signature: {sig_check['has_signature']}")
        print(f"Number of signatures: {sig_check['num_signatures']}")
        print(f"Is encrypted: {sig_check['is_encrypted']}")
        print(f"Is locked: {sig_check['is_locked']}")

        if sig_check['error']:
            print(f"ERROR: {sig_check['error']}")

        signatures = sig_check['signature_info']
        if signatures:
            print("\nSignature details:")
            for info in signatures[:max_shown]:
                print(f" Signature #{info['index']}:")
                print(f" Has value: {info.get('has_value', False)}")
                for key, template in optional_lines:
                    if key in info:
                        print(template.format(info[key]))

            # Mention the truncation only when something was actually left
            # out (previously "and 0 more" was printed for exactly three
            # signatures).
            hidden = len(signatures) - max_shown
            if hidden > 0:
                print(f" ... (and {hidden} more signatures)")

        # CRT extraction attempt
        print("\n2. CRT EXTRACTION ATTEMPT")
        print("-" * 80)
        extraction_result = extract_crt_from_pdf(pdf_path)

        print(f"Success: {extraction_result['success']}")
        print(f"Method: {extraction_result['method']}")
        print(f"Institution: {extraction_result['institution']}")

        if extraction_result['error']:
            print(f"ERROR: {extraction_result['error']}")

        # Summary
        print("\n3. SUMMARY")
        print("-" * 80)
        if sig_check['has_signature']:
            print("[OK] PDF contains digital signatures")
            if extraction_result['success']:
                print(f"[OK] CRT extraction SUCCESSFUL: {extraction_result['institution']}")
            else:
                print("[FAIL] CRT extraction FAILED despite having signatures")
        else:
            print("[FAIL] PDF does NOT contain digital signatures")
            print(" -> CRT extraction is not possible (likely a scanned PDF)")
            print(" -> OCR-based extraction should be used instead")

    print("\n" + "="*80)
    print("DIAGNOSTIC COMPLETE")
    print("="*80)
|
||
|
||
# Run the diagnostic report only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|