report-detect/archive/crt_tests/inspect_certificate_data.py

132 lines
6.3 KiB
Python
Raw Normal View History

chore(project): conservative cleanup - archive temp scripts and old docs Major cleanup to improve project organization and maintainability. Changes: - Moved 34 temp/debug/test scripts to archive/temp_scripts/ - Moved 9 auxiliary tools to archive/tools/ - Moved 3 CRT test scripts to archive/crt_tests/ - Moved 4 OCR test scripts to archive/ocr_tests/ - Moved 14 old documentation files to archive/docs/ - Deleted 4 useless files (duplicates, temp files) Root directory: - Before: 67 files (cluttered) - After: 10 core files (clean and organized) Core files retained: - test_accuracy_batch_full.py (main script) - cma_extraction_template_primary.py (CMA extraction) - cma_extraction_final.py (backup CMA extraction) - CLAUDE.md (project guide) - TEST_ACCURACY_BATCH_README.md (usage guide) - TEST_ACCURACY_BATCH_DEPENDENCIES.md (dependency docs) - CLEANUP_PLAN.md (cleanup plan) - CLEANUP_SUMMARY.md (this file) - IMPLEMENTATION_SUMMARY.md (implementation summary) - requirements.txt (dependencies) Archive structure: archive/ ├── temp_scripts/ (34 files: test_, debug_, analyze_, etc.) ├── tools/ (9 files: find_, show_, visualize_, etc.) ├── crt_tests/ (3 files: CRT extraction tests) ├── ocr_tests/ (4 files: OCR timeout tests) └── docs/ (14 files: old reports and guides) Benefits: ✓ Cleaner root directory - easier navigation ✓ Better organization - clear separation of concerns ✓ Preserved history - all files archived, not deleted ✓ Improved maintainability - easier to find active files ✓ Better git history - removed 198 deleted files from tracking No functional changes - all core functionality preserved. Related: - TEST_ACCURACY_BATCH_DEPENDENCIES.md - dependency analysis - CLEANUP_PLAN.md - detailed cleanup plan Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-03 14:35:06 +08:00
"""
深度检查PDF签名中的证书数据
"""
import pikepdf
import re
from pathlib import Path
# Regexes applied to the lossy UTF-8 decoding of a signature's /Contents.
# Each has exactly one capture group, so re.findall returns plain strings.
_CERT_TEXT_PATTERNS = (
    r'(广东产品质量监督检验研究院)',
    r'(广东省?产品质量监督检验)',
    r'(质量监督检验)',
    r'O=([^,\n]+)',  # X.509 Organization field
    r'CN=([^,\n]+)',  # X.509 Common Name field
)

# Institution names searched for as raw UTF-8 byte sequences in /Contents.
_TARGET_INSTITUTIONS = (
    "广东产品质量监督检验研究院",
    "广东产品质量监督检验",
    "广东省产品质量监督检验研究院",
)


def _report_decoded_text(data):
    """Print the lossy UTF-8 decoding of *data* and every pattern that matches it."""
    try:
        # errors='ignore' drops undecodable bytes, so binary data still
        # yields whatever readable text fragments it contains.
        decoded = data.decode('utf-8', errors='ignore')
        print(f" UTF-8 decoded (first 500 chars): {decoded[:500]}")
        for pattern in _CERT_TEXT_PATTERNS:
            matches = re.findall(pattern, decoded)
            if matches:
                print(f" Pattern '{pattern}' found: {matches}")
    except Exception as exc:
        # Best-effort diagnostic step: report and carry on with the byte search.
        print(f" UTF-8 decode error: {exc}")


def _report_institution_hits(data):
    """Print each target institution whose UTF-8 encoding occurs in *data*."""
    for institution in _TARGET_INSTITUTIONS:
        encoded = institution.encode('utf-8')
        position = data.find(encoded)
        if position == -1:
            continue
        print(f" FOUND IN CERTIFICATE DATA: {institution}")
        print(f" Encoded bytes: {encoded.hex()}")
        print(f" Position: {position}")


def _report_contents(contents):
    """Convert a signature's /Contents value to bytes and dump what it holds."""
    print(f" Contents type: {type(contents)}")
    try:
        # Only call bytes() on objects that opt in via __bytes__; bytes() on an
        # arbitrary object (e.g. an int) would silently fabricate data.
        # Anything else is passed through unchanged and reported below as
        # non-bytes, instead of reaching into a private attribute.
        if hasattr(contents, '__bytes__'):
            raw = bytes(contents)
        else:
            raw = contents
        print(f" Contents bytes type: {type(raw)}")

        if not isinstance(raw, (bytes, bytearray)):
            print(f" Contents is NOT bytes/bytearray, type: {type(raw)}")
            print(f" Contents value: {raw}")
            return

        print(f" Certificate data size: {len(raw)} bytes")
        print(f" Certificate data (first 200 bytes, hex): {raw[:200].hex()}")
        print(f" Certificate data (first 200 bytes, repr): {repr(raw[:200])}")
        _report_decoded_text(raw)
        _report_institution_hits(raw)
    except Exception as exc:
        # A failure on one signature must not abort inspection of the others.
        print(f" ERROR converting Contents to bytes: {exc}")
        import traceback
        traceback.print_exc()


def _report_text_entry(sig_value, name):
    """Print the text entry *name* (e.g. ``Reason``) of a signature, if present."""
    if f'/{name}' not in sig_value:
        return
    text = str(getattr(sig_value, name))
    print(f" {name}: '{text}' (length: {len(text)})")
    if not text:
        return
    try:
        print(f" {name} bytes: {text.encode('utf-8')}")
    except UnicodeEncodeError as exc:
        # e.g. lone surrogates; say so instead of silently printing nothing.
        print(f" {name} bytes: <not UTF-8 encodable: {exc}>")


def inspect_certificate_data(pdf_path, *, max_signatures=3):
    """Print a diagnostic dump of the signature fields found in a PDF.

    Walks ``/AcroForm/Fields`` and, for each signature field (``/FT`` equal to
    ``/Sig`` with a ``/V`` value), prints the value's keys, a dump of
    ``/Contents`` (size, hex/repr preview, lossy UTF-8 decoding, institution
    name matches) and the ``/Reason`` and ``/Location`` entries when present.

    Args:
        pdf_path: Path of the PDF file to open.
        max_signatures: Maximum number of signature fields to report
            (default 3).

    Returns:
        None. All results are printed. Any exception raised while opening or
        walking the PDF is caught, printed with its traceback, and not
        re-raised.
    """
    banner = '=' * 80
    print(f"\n{banner}")
    print(f"INSPECTING: {Path(pdf_path).name}")
    print(f"{banner}\n")

    try:
        with pikepdf.open(pdf_path) as pdf:
            if '/AcroForm' not in pdf.Root:
                return
            acroform = pdf.Root.AcroForm
            if '/Fields' not in acroform:
                return

            sig_count = 0
            for field in acroform.Fields:
                is_signature = (
                    '/FT' in field and field.FT == '/Sig' and '/V' in field
                )
                if not is_signature:
                    continue

                sig_count += 1
                if sig_count > max_signatures:
                    break

                sig_value = field.V
                # Signatures are labelled from 0 in the output.
                print(f"Signature #{sig_count - 1}:")
                print(f" Keys: {list(sig_value.keys())}")

                if '/Contents' in sig_value:
                    _report_contents(sig_value.Contents)
                _report_text_entry(sig_value, 'Reason')
                _report_text_entry(sig_value, 'Location')
                print()
    except Exception as exc:
        # Top-level boundary of a diagnostic script: report and keep going so
        # the caller can move on to the next file.
        print(f"ERROR: {exc}")
        import traceback
        traceback.print_exc()
def main():
    """Inspect each hard-coded sample PDF in turn, then print a closing banner."""
    sample_pdfs = (
        "src/test/resources/data/pdfs/YDQ25_002294.pdf",
        "src/test/resources/data/pdfs/YDQ23_001838.pdf",
    )
    for sample in sample_pdfs:
        inspect_certificate_data(sample)

    # Closing banner, printed once after every file has been processed.
    rule = "=" * 80
    print("\n" + rule)
    print("INSPECTION COMPLETE")
    print(rule)
# Run the inspection only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()