# report-detect/archive/crt_tests/inspect_certificate_data.py

"""
In-depth inspection of the certificate data embedded in PDF signatures.
"""
import pikepdf
import re
from pathlib import Path

# Regexes run against the lossy UTF-8 decoding of the raw /Contents bytes.
_INSTITUTION_PATTERNS = (
    r'(广东产品质量监督检验研究院)',
    r'(广东省?产品质量监督检验)',
    r'(质量监督检验)',
    r'O=([^,\n]+)',  # X.509 Organization field
    r'CN=([^,\n]+)',  # X.509 Common Name field
)

# Institution names searched for as raw UTF-8 byte sequences.
_TARGET_INSTITUTIONS = (
    "广东产品质量监督检验研究院",
    "广东产品质量监督检验",
    "广东省产品质量监督检验研究院",
)


def _iter_signature_values(pdf):
    """Yield the /V value of every top-level AcroForm field that is a signed /Sig field."""
    if '/AcroForm' not in pdf.Root:
        return
    acroform = pdf.Root.AcroForm
    if '/Fields' not in acroform:
        return
    for field in acroform.Fields:
        if '/FT' in field and field.FT == '/Sig' and '/V' in field:
            yield field.V


def _dump_certificate_bytes(contents_bytes):
    """Print size, a hex/repr preview, regex hits and raw byte hits for the signature bytes."""
    print(f"  Certificate data size: {len(contents_bytes)} bytes")
    head = contents_bytes[:200]
    print(f"  Certificate data (first 200 bytes, hex): {head.hex()}")
    print(f"  Certificate data (first 200 bytes, repr): {repr(head)}")

    # Best-effort text view. errors='ignore' means the decode itself does not
    # raise, but printing arbitrary decoded text can still fail (for example
    # on a console with a narrow encoding), so the handler is kept to let the
    # byte-level search below run regardless.
    try:
        decoded = contents_bytes.decode('utf-8', errors='ignore')
        print(f"  UTF-8 decoded (first 500 chars): {decoded[:500]}")

        for pattern in _INSTITUTION_PATTERNS:
            matches = re.findall(pattern, decoded)
            if matches:
                print(f"  Pattern '{pattern}' found: {matches}")
    except Exception as e:
        print(f"  UTF-8 decode error: {e}")

    # Search the raw bytes for the UTF-8 encoding of each institution name.
    for inst in _TARGET_INSTITUTIONS:
        encoded = inst.encode('utf-8')
        position = contents_bytes.find(encoded)
        if position != -1:
            print(f"  FOUND IN CERTIFICATE DATA: {inst}")
            print(f"    Encoded bytes: {encoded.hex()}")
            print(f"    Position: {position}")


def _report_contents(contents):
    """Convert the /Contents entry to bytes and dump it; report (not raise) any failure."""
    import traceback  # local import: the module does not import traceback at top level

    print(f"  Contents type: {type(contents)}")
    try:
        # Convert through the public bytes protocol only. Objects without
        # __bytes__ are rejected explicitly instead of probing a private
        # attribute or letting bytes() reinterpret them (e.g. bytes(5)).
        if not hasattr(contents, '__bytes__'):
            raise TypeError(
                f"/Contents of type {type(contents).__name__} does not support bytes()"
            )
        contents_bytes = bytes(contents)
        print(f"  Contents bytes type: {type(contents_bytes)}")
        _dump_certificate_bytes(contents_bytes)
    except Exception as e:
        print(f"  ERROR converting Contents to bytes: {e}")
        traceback.print_exc()


def _report_text_entry(sig_value, key, label):
    """Print a textual signature entry (such as /Reason) and its UTF-8 bytes, if present."""
    if key not in sig_value:
        return
    text = str(sig_value[key])
    print(f"  {label}: '{text}' (length: {len(text)})")
    if not text:
        return
    try:
        print(f"    {label} bytes: {text.encode('utf-8')}")
    except UnicodeEncodeError as e:
        # Only an encoding failure is expected here (e.g. lone surrogates);
        # say so instead of swallowing every exception silently.
        print(f"    {label} bytes unavailable: {e}")


def _report_signature(index, sig_value):
    """Print everything inspected for one signature value, followed by a blank line."""
    print(f"Signature #{index}:")
    print(f"  Keys: {list(sig_value.keys())}")

    if '/Contents' in sig_value:
        _report_contents(sig_value.Contents)

    _report_text_entry(sig_value, '/Reason', 'Reason')
    _report_text_entry(sig_value, '/Location', 'Location')
    print()


def inspect_certificate_data(pdf_path, max_signatures=3):
    """Print a diagnostic dump of the signature data found in a PDF.

    Opens the PDF with pikepdf, walks the top-level AcroForm fields, and for
    each signed signature field prints the keys of its value dictionary, a
    preview of the raw /Contents bytes, institution-name matches, and the
    /Reason and /Location entries when present.

    Args:
        pdf_path: Path of the PDF file to inspect.
        max_signatures: Maximum number of signatures to report (default 3).

    Returns:
        None. All results are printed to stdout. Any exception raised while
        opening or walking the PDF is caught and reported with a traceback
        rather than propagated.
    """
    import traceback  # local import: the module does not import traceback at top level

    print(f"\n{'='*80}")
    print(f"INSPECTING: {Path(pdf_path).name}")
    print(f"{'='*80}\n")

    try:
        with pikepdf.open(pdf_path) as pdf:
            # Labels are zero-based: the first signature is "Signature #0".
            for index, sig_value in enumerate(_iter_signature_values(pdf)):
                if index >= max_signatures:
                    break
                _report_signature(index, sig_value)
    except Exception as e:
        print(f"ERROR: {e}")
        traceback.print_exc()

def main():
    """Inspect each hard-coded sample PDF in turn, then print a closing banner."""
    sample_pdfs = (
        "src/test/resources/data/pdfs/YDQ25_002294.pdf",
        "src/test/resources/data/pdfs/YDQ23_001838.pdf",
    )
    for sample in sample_pdfs:
        inspect_certificate_data(sample)

    # The same 80-character rule frames the completion message above and below.
    rule = "=" * 80
    print("\n" + rule)
    print("INSPECTION COMPLETE")
    print(rule)

# Run the inspection only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()