"""Diagnose CRT (issuing institution) extraction problems in signed PDFs.

Inspects the digital-signature state of PDF reports -- by default
YDQ25_002294.pdf and YDQ23_001838.pdf -- and tries to recover the name of the
issuing institution from the signature data or the document information.
"""

import sys
from pathlib import Path

try:
    import pikepdf
except ImportError:
    # Reported per PDF by the functions below instead of crashing on import.
    pikepdf = None


# PDFs analysed when no paths are supplied.
_DEFAULT_TEST_PDFS = (
    "src/test/resources/data/pdfs/YDQ25_002294.pdf",
    "src/test/resources/data/pdfs/YDQ23_001838.pdf",
)

# Institution names searched for inside the raw signature bytes, most
# specific first so that the longest match wins.
_KNOWN_INSTITUTIONS = (
    "广东产品质量监督检验研究院",
    "广东产品质量监督检验",
    "广东省产品质量监督检验研究院",
    "质量监督检验研究院",
    "产品质量监督检验",
    "质量监督检验",
)

# Text encodings tried for each name: certificate strings are commonly stored
# as UTF8String (UTF-8) or BMPString (UTF-16 big-endian).
_NAME_ENCODINGS = ("utf-8", "utf-16-be")

# Guards the field-tree walk against cyclic /Kids references.
_MAX_FIELD_DEPTH = 32

# How many signatures main() prints in detail.
_MAX_SIGNATURES_SHOWN = 3

_PIKEPDF_MISSING = "pikepdf is not installed"


def _walk_signature_fields(fields, depth):
    """Yield signature fields found in *fields*, descending into /Kids."""
    for field in fields:
        if '/FT' in field and field.FT == '/Sig':
            yield field
        elif depth < _MAX_FIELD_DEPTH and '/Kids' in field:
            yield from _walk_signature_fields(field.Kids, depth + 1)


def _iter_signature_fields(pdf):
    """Yield every AcroForm field of type /Sig in *pdf*, in document order."""
    root = pdf.Root
    if '/AcroForm' not in root:
        return
    acroform = root.AcroForm
    if '/Fields' not in acroform:
        return
    yield from _walk_signature_fields(acroform.Fields, 0)


def _signature_bytes(sig_value):
    """Return the raw bytes of the /Contents entry of a signature dictionary."""
    contents = sig_value.Contents
    # pikepdf hands back a String object rather than bytes; convert explicitly.
    return contents if isinstance(contents, bytes) else bytes(contents)


def _find_institution(data, candidates=_KNOWN_INSTITUTIONS):
    """Return the first candidate name that occurs in *data*, or None."""
    for name in candidates:
        if any(name.encode(encoding) in data for encoding in _NAME_ENCODINGS):
            return name
    return None


def _describe_signature(index, sig_field):
    """Collect diagnostic details about one signature field into a dict."""
    info = {
        'index': index,
        'has_value': '/V' in sig_field,
    }
    if not info['has_value']:
        return info

    try:
        sig_value = sig_field.V
        info['has_content'] = True
        info['keys'] = list(sig_value.keys())

        if '/Name' in sig_value:
            info['signer_name'] = str(sig_value.Name)

        if '/Contents' in sig_value:
            info['has_certificate_data'] = True
            try:
                data = _signature_bytes(sig_value)
                info['certificate_size'] = len(data)
                institution = _find_institution(data)
                if institution is not None:
                    info['institution_in_cert'] = institution
            except Exception as e:
                info['cert_decode_error'] = str(e)

        if '/Reason' in sig_value:
            info['reason'] = str(sig_value.Reason)
        if '/Location' in sig_value:
            info['location'] = str(sig_value.Location)
        if '/M' in sig_value:
            info['modification_date'] = str(sig_value.M)
    except Exception as e:
        # Keep whatever was gathered so far and record why it stopped.
        info['error'] = str(e)
    return info


def check_pdf_signature(pdf_path):
    """Report whether a PDF carries digital signatures.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        dict with the keys 'pdf_name', 'has_signature', 'num_signatures',
        'signature_info' (one dict per signature field), 'is_encrypted',
        'is_locked' and 'error' (None on success). 'permissions' is added
        when the document permissions could be read, 'permissions_error'
        when reading them failed.
    """
    result = {
        'pdf_name': Path(pdf_path).name,
        'has_signature': False,
        'num_signatures': 0,
        'signature_info': [],
        'is_encrypted': False,
        'is_locked': False,
        'error': None,
    }
    if pikepdf is None:
        result['error'] = _PIKEPDF_MISSING
        return result

    try:
        with pikepdf.open(pdf_path) as pdf:
            result['is_encrypted'] = pdf.is_encrypted

            sig_fields = list(_iter_signature_fields(pdf))
            result['num_signatures'] = len(sig_fields)
            result['has_signature'] = bool(sig_fields)
            for index, sig_field in enumerate(sig_fields):
                result['signature_info'].append(
                    _describe_signature(index, sig_field))

            # Permissions are informational only; never fail the check on them.
            try:
                result['permissions'] = pdf.allow
            except Exception as e:
                result['permissions_error'] = str(e)
    except pikepdf.PasswordError:
        result['error'] = "PDF is password-protected"
        result['is_locked'] = True
    except Exception as e:
        result['error'] = f"Failed to open PDF: {e}"
    return result


def _succeed(result, institution, method):
    """Mark *result* as successful and return it."""
    result['success'] = True
    result['institution'] = institution
    result['method'] = method
    return result


def extract_crt_from_pdf(pdf_path):
    """Try to extract the CRT institution name from a PDF.

    Sources are tried in this order and the first hit wins: the signature's
    /Name entry, a known institution name inside the signature's /Contents
    bytes, the signature's /Reason and /Location entries, and finally the
    /Author and /Subject entries of the document information dictionary.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        dict with the keys 'pdf_name', 'success', 'institution', 'method'
        and 'error'. 'cert_error' is added when reading the signature
        contents failed.
    """
    result = {
        'pdf_name': Path(pdf_path).name,
        'success': False,
        'institution': None,
        'method': None,
        'error': None,
    }
    if pikepdf is None:
        result['error'] = _PIKEPDF_MISSING
        return result

    try:
        with pikepdf.open(pdf_path) as pdf:
            # Method 1: signature fields of the AcroForm.
            for field in _iter_signature_fields(pdf):
                if '/V' not in field:
                    continue
                sig_value = field.V

                # Attempt 1: the signer name stored in /Name.
                if '/Name' in sig_value:
                    name = str(sig_value.Name)
                    if name.strip():
                        return _succeed(
                            result, name, 'acroform_signature_name')

                # Attempt 2: a known institution name inside /Contents.
                if '/Contents' in sig_value:
                    try:
                        institution = _find_institution(
                            _signature_bytes(sig_value))
                    except Exception as e:
                        result['cert_error'] = str(e)
                    else:
                        if institution is not None:
                            return _succeed(
                                result, institution,
                                'acroform_certificate_data')

                # Attempt 3: the free-text /Reason and /Location entries.
                if '/Reason' in sig_value:
                    reason = str(sig_value.Reason)
                    if len(reason) > 3:
                        return _succeed(
                            result, reason, 'acroform_signature_reason')
                if '/Location' in sig_value:
                    location = str(sig_value.Location)
                    if len(location) > 3:
                        return _succeed(
                            result, location, 'acroform_signature_location')

            # NOTE: XMP metadata (pdf.Root.Metadata) is not parsed yet.

            # Method 2: the document information dictionary. It is referenced
            # from the trailer; the catalog is kept as a fallback only.
            doc_info = None
            if '/Info' in pdf.trailer:
                doc_info = pdf.trailer.Info
            elif '/Info' in pdf.Root:
                doc_info = pdf.Root.Info
            if doc_info is not None:
                for key, method in (
                    ('/Author', 'document_info_author'),
                    ('/Subject', 'document_info_subject'),
                ):
                    if key in doc_info:
                        value = str(doc_info[key])
                        if value.strip():
                            return _succeed(result, value, method)

            result['error'] = "No signature or institution name found in PDF"
    except Exception as e:
        result['error'] = f"Extraction failed: {e}"
    return result


def _print_signature_details(signature_info):
    """Print the details of the first few signatures of a PDF."""
    print("\nSignature details:")
    shown = signature_info[:_MAX_SIGNATURES_SHOWN]
    for info in shown:
        print(f" Signature #{info['index']}:")
        print(f" Has value: {info.get('has_value', False)}")
        if 'keys' in info:
            print(f" Keys in signature: {info['keys']}")
        if 'signer_name' in info:
            print(f" Signer name: {info['signer_name']}")
        if 'institution_in_cert' in info:
            print(f" Institution found in certificate: "
                  f"{info['institution_in_cert']}")
        if 'certificate_size' in info:
            print(f" Certificate data size: "
                  f"{info['certificate_size']} bytes")
        if 'reason' in info:
            print(f" Reason: {info['reason']}")
        if 'location' in info:
            print(f" Location: {info['location']}")
        if 'error' in info:
            print(f" Error: {info['error']}")

    # Only the first few signatures are detailed to keep the output short.
    remaining = len(signature_info) - len(shown)
    if remaining > 0:
        print(f" ... (and {remaining} more signatures)")


def main(pdf_paths=None):
    """Print a diagnostic report for each PDF.

    Args:
        pdf_paths: Iterable of PDF paths to analyse. When omitted or empty,
            the two built-in test PDFs are used.
    """
    targets = list(pdf_paths) if pdf_paths else list(_DEFAULT_TEST_PDFS)

    print("=" * 80)
    print("CRT EXTRACTION DIAGNOSTIC REPORT")
    print("=" * 80)

    for pdf_path in targets:
        print(f"\n{'#' * 80}")
        print(f"PDF: {Path(pdf_path).name}")
        print(f"{'#' * 80}\n")

        # Signature status.
        print("1. SIGNATURE STATUS CHECK")
        print("-" * 80)
        sig_check = check_pdf_signature(pdf_path)
        print(f"Has digital signature: {sig_check['has_signature']}")
        print(f"Number of signatures: {sig_check['num_signatures']}")
        print(f"Is encrypted: {sig_check['is_encrypted']}")
        print(f"Is locked: {sig_check['is_locked']}")
        if sig_check['error']:
            print(f"ERROR: {sig_check['error']}")
        if sig_check['signature_info']:
            _print_signature_details(sig_check['signature_info'])

        # CRT extraction.
        print("\n2. CRT EXTRACTION ATTEMPT")
        print("-" * 80)
        extraction_result = extract_crt_from_pdf(pdf_path)
        print(f"Success: {extraction_result['success']}")
        print(f"Method: {extraction_result['method']}")
        print(f"Institution: {extraction_result['institution']}")
        if extraction_result['error']:
            print(f"ERROR: {extraction_result['error']}")

        # Summary.
        print("\n3. SUMMARY")
        print("-" * 80)
        if sig_check['has_signature']:
            print("[OK] PDF contains digital signatures")
            if extraction_result['success']:
                print(f"[OK] CRT extraction SUCCESSFUL: "
                      f"{extraction_result['institution']}")
            else:
                print("[FAIL] CRT extraction FAILED despite having signatures")
        elif sig_check['error']:
            # A PDF that could not be read says nothing about its signatures.
            print(f"[FAIL] PDF could not be analyzed: {sig_check['error']}")
            print(" -> Signature status is unknown")
        else:
            print("[FAIL] PDF does NOT contain digital signatures")
            print(" -> CRT extraction is not possible (likely a scanned PDF)")
            print(" -> OCR-based extraction should be used instead")

    print("\n" + "=" * 80)
    print("DIAGNOSTIC COMPLETE")
    print("=" * 80)


if __name__ == "__main__":
    main(sys.argv[1:] or None)