# report-detect/archive/crt_tests/inspect_certificate_data.py

"""
深度检查PDF签名中的证书数据
"""
import pikepdf
import re
from pathlib import Path

# Regexes applied to the lossy UTF-8 decoding of the signature bytes.
_NAME_PATTERNS = (
    r'(广东产品质量监督检验研究院)',
    r'(广东省?产品质量监督检验)',
    r'(质量监督检验)',
    r'O=([^,\n]+)',  # X.509 Organization field
    r'CN=([^,\n]+)',  # X.509 Common Name field
)

# Institution names searched for as raw UTF-8 byte sequences.
_TARGET_INSTITUTIONS = (
    "广东产品质量监督检验研究院",
    "广东产品质量监督检验",
    "广东省产品质量监督检验研究院",
)


def _report_signature_bytes(contents_bytes):
    """Print size, previews and institution-name hits for raw signature bytes."""
    print(f"  Certificate data size: {len(contents_bytes)} bytes")
    preview = contents_bytes[:200]
    print(f"  Certificate data (first 200 bytes, hex): {preview.hex()}")
    print(f"  Certificate data (first 200 bytes, repr): {repr(preview)}")

    # Decoding with errors='ignore' cannot fail; the guard mainly protects the
    # prints below (e.g. a console that cannot encode the decoded text).
    try:
        decoded = contents_bytes.decode('utf-8', errors='ignore')
        print(f"  UTF-8 decoded (first 500 chars): {decoded[:500]}")

        for pattern in _NAME_PATTERNS:
            matches = re.findall(pattern, decoded)
            if matches:
                print(f"  Pattern '{pattern}' found: {matches}")
    except Exception as e:
        print(f"  UTF-8 decode error: {e}")

    # Byte-level search is independent of the lossy decode above.
    for inst in _TARGET_INSTITUTIONS:
        encoded = inst.encode('utf-8')
        position = contents_bytes.find(encoded)
        if position != -1:
            print(f"  FOUND IN CERTIFICATE DATA: {inst}")
            print(f"    Encoded bytes: {encoded.hex()}")
            print(f"    Position: {position}")


def _report_contents(contents):
    """Convert a signature's /Contents value to bytes and report on it."""
    import traceback

    print(f"  Contents type: {type(contents)}")

    try:
        if hasattr(contents, '__bytes__'):
            contents_bytes = bytes(contents)
        else:
            # Best-effort fallback kept from the original script.
            # NOTE(review): `_obj` is a private attribute; confirm it exists
            # on the objects that can actually reach this branch.
            contents_bytes = contents._obj

        print(f"  Contents bytes type: {type(contents_bytes)}")

        if isinstance(contents_bytes, (bytes, bytearray)):
            _report_signature_bytes(contents_bytes)
        else:
            print(f"  Contents is NOT bytes/bytearray, type: {type(contents_bytes)}")
            print(f"  Contents value: {contents_bytes}")
    except Exception as e:
        print(f"  ERROR converting Contents to bytes: {e}")
        traceback.print_exc()


def _report_text_entry(label, value):
    """Print a textual signature entry (e.g. Reason) and its UTF-8 bytes."""
    text = str(value)
    print(f"  {label}: '{text}' (length: {len(text)})")
    if not text:
        return
    try:
        print(f"    {label} bytes: {text.encode('utf-8')}")
    except UnicodeEncodeError as e:
        # Previously swallowed by a bare `except: pass`; report it instead so
        # an unencodable value is visible in the inspection output.
        print(f"    {label} bytes: <not encodable as UTF-8: {e}>")


def inspect_certificate_data(pdf_path, max_signatures=3):
    """Print diagnostic details about the signature fields of a PDF.

    For each signature field (``/FT == /Sig`` with a ``/V`` value) found in
    the document's ``/AcroForm`` ``/Fields`` array, prints the keys of the
    signature dictionary, a dump and institution-name scan of ``/Contents``,
    and the ``/Reason`` and ``/Location`` entries when present.

    Args:
        pdf_path: Path of the PDF file to open with pikepdf.
        max_signatures: Maximum number of signature fields to inspect;
            defaults to 3, the limit the script originally hard-coded.

    Returns:
        None. All results are printed to stdout. Any exception raised while
        opening or walking the PDF is caught, printed as ``ERROR: ...`` and
        followed by a traceback, so the function itself does not raise.
    """
    import traceback

    print(f"\n{'='*80}")
    print(f"INSPECTING: {Path(pdf_path).name}")
    print(f"{'='*80}\n")

    try:
        with pikepdf.open(pdf_path) as pdf:
            if '/AcroForm' not in pdf.Root or '/Fields' not in pdf.Root.AcroForm:
                print("  No /AcroForm fields found")
                return

            inspected = 0
            for field in pdf.Root.AcroForm.Fields:
                is_signed = '/FT' in field and field.FT == '/Sig' and '/V' in field
                if not is_signed:
                    continue
                if inspected >= max_signatures:  # only inspect the first few
                    break

                sig_value = field.V
                print(f"Signature #{inspected}:")
                print(f"  Keys: {list(sig_value.keys())}")

                if '/Contents' in sig_value:
                    _report_contents(sig_value.Contents)
                if '/Reason' in sig_value:
                    _report_text_entry("Reason", sig_value.Reason)
                if '/Location' in sig_value:
                    _report_text_entry("Location", sig_value.Location)

                print()
                inspected += 1

            if inspected == 0:
                print("  No signed signature fields found")

    except Exception as e:
        print(f"ERROR: {e}")
        traceback.print_exc()

def main():
    """Inspect each hard-coded sample PDF, then print a closing banner."""
    banner = "=" * 80
    sample_pdfs = (
        "src/test/resources/data/pdfs/YDQ25_002294.pdf",
        "src/test/resources/data/pdfs/YDQ23_001838.pdf",
    )

    for sample in sample_pdfs:
        inspect_certificate_data(sample)

    print("\n" + banner)
    print("INSPECTION COMPLETE")
    print(banner)

# Run the inspection only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
chore(project): conservative cleanup - archive temp scripts and old docs Major cleanup to improve project organization and maintainability. Changes: - Moved 34 temp/debug/test scripts to archive/temp_scripts/ - Moved 9 auxiliary tools to archive/tools/ - Moved 3 CRT test scripts to archive/crt_tests/ - Moved 4 OCR test scripts to archive/ocr_tests/ - Moved 14 old documentation files to archive/docs/ - Deleted 4 useless files (duplicates, temp files) Root directory: - Before: 67 files (cluttered) - After: 10 core files (clean and organized) Core files retained: - test_accuracy_batch_full.py (main script) - cma_extraction_template_primary.py (CMA extraction) - cma_extraction_final.py (backup CMA extraction) - CLAUDE.md (project guide) - TEST_ACCURACY_BATCH_README.md (usage guide) - TEST_ACCURACY_BATCH_DEPENDENCIES.md (dependency docs) - CLEANUP_PLAN.md (cleanup plan) - CLEANUP_SUMMARY.md (this file) - IMPLEMENTATION_SUMMARY.md (implementation summary) - requirements.txt (dependencies) Archive structure: archive/ ├── temp_scripts/ (34 files: test_, debug_, analyze_, etc.) ├── tools/ (9 files: find_, show_, visualize_, etc.) ├── crt_tests/ (3 files: CRT extraction tests) ├── ocr_tests/ (4 files: OCR timeout tests) └── docs/ (14 files: old reports and guides) Benefits: ✓ Cleaner root directory - easier navigation ✓ Better organization - clear separation of concerns ✓ Preserved history - all files archived, not deleted ✓ Improved maintainability - easier to find active files ✓ Better git history - removed 198 deleted files from tracking No functional changes - all core functionality preserved. Related: - TEST_ACCURACY_BATCH_DEPENDENCIES.md - dependency analysis - CLEANUP_PLAN.md - detailed cleanup plan Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> 2026-03-03 14:35:06 +08:00			`"""`
			`深度检查PDF签名中的证书数据`
			`"""`
			`import pikepdf`
			`import re`
			`from pathlib import Path`

			`def inspect_certificate_data(pdf_path):`
			`"""检查证书数据的内容"""`
			`print(f"\n{'='*80}")`
			`print(f"INSPECTING: {Path(pdf_path).name}")`
			`print(f"{'='*80}\n")`

			`try:`
			`with pikepdf.open(pdf_path) as pdf:`
			`if '/AcroForm' in pdf.Root:`
			`acroform = pdf.Root.AcroForm`
			`if '/Fields' in acroform:`
			`sig_count = 0`
			`for field in acroform.Fields:`
			`if '/FT' in field and field.FT == '/Sig' and '/V' in field:`
			`sig_count += 1`
			`if sig_count > 3: # 只检查前3个签名`
			`break`

			`sig_value = field.V`
			`print(f"Signature #{sig_count - 1}:")`
			`print(f" Keys: {list(sig_value.keys())}")`

			`if '/Contents' in sig_value:`
			`contents = sig_value.Contents`
			`print(f" Contents type: {type(contents)}")`

			`# PikePDF Object需要转换为bytes`
			`try:`
			`if hasattr(contents, '__bytes__'):`
			`contents_bytes = bytes(contents)`
			`else:`
			`# 尝试直接访问`
			`contents_bytes = contents._obj`

			`print(f" Contents bytes type: {type(contents_bytes)}")`

			`if isinstance(contents_bytes, (bytes, bytearray)):`
			`print(f" Certificate data size: {len(contents_bytes)} bytes")`
			`print(f" Certificate data (first 200 bytes, hex): {contents_bytes[:200].hex()}")`
			`print(f" Certificate data (first 200 bytes, repr): {repr(contents_bytes[:200])}")`

			`# 尝试UTF-8解码`
			`try:`
			`decoded = contents_bytes.decode('utf-8', errors='ignore')`
			`print(f" UTF-8 decoded (first 500 chars): {decoded[:500]}")`

			`# 查找机构名称模式`
			`patterns = [`
			`r'(广东产品质量监督检验研究院)',`
			`r'(广东省?产品质量监督检验)',`
			`r'(质量监督检验)',`
			`r'O=([^,\n]+)', # X.509 Organization field`
			`r'CN=([^,\n]+)', # X.509 Common Name field`
			`]`

			`for pattern in patterns:`
			`matches = re.findall(pattern, decoded)`
			`if matches:`
			`print(f" Pattern '{pattern}' found: {matches}")`
			`except Exception as e:`
			`print(f" UTF-8 decode error: {e}")`

			`# 检查是否包含特定的UTF-8编码字符串`
			`target_institutions = [`
			`"广东产品质量监督检验研究院",`
			`"广东产品质量监督检验",`
			`"广东省产品质量监督检验研究院",`
			`]`

			`for inst in target_institutions:`
			`encoded = inst.encode('utf-8')`
			`if encoded in contents_bytes:`
			`print(f" FOUND IN CERTIFICATE DATA: {inst}")`
			`print(f" Encoded bytes: {encoded.hex()}")`
			`print(f" Position: {contents_bytes.find(encoded)}")`
			`else:`
			`print(f" Contents is NOT bytes/bytearray, type: {type(contents_bytes)}")`
			`print(f" Contents value: {contents_bytes}")`

			`except Exception as e:`
			`print(f" ERROR converting Contents to bytes: {e}")`
			`import traceback`
			`traceback.print_exc()`

			`if '/Reason' in sig_value:`
			`reason = str(sig_value.Reason)`
			`print(f" Reason: '{reason}' (length: {len(reason)})")`
			`if reason:`
			`try:`
			`print(f" Reason bytes: {reason.encode('utf-8')}")`
			`except:`
			`pass`

			`if '/Location' in sig_value:`
			`location = str(sig_value.Location)`
			`print(f" Location: '{location}' (length: {len(location)})")`
			`if location:`
			`try:`
			`print(f" Location bytes: {location.encode('utf-8')}")`
			`except:`
			`pass`

			`print()`

			`except Exception as e:`
			`print(f"ERROR: {e}")`
			`import traceback`
			`traceback.print_exc()`

			`def main():`
			`test_pdfs = [`
			`"src/test/resources/data/pdfs/YDQ25_002294.pdf",`
			`"src/test/resources/data/pdfs/YDQ23_001838.pdf",`
			`]`

			`for pdf_path in test_pdfs:`
			`inspect_certificate_data(pdf_path)`

			`print("\n" + "="*80)`
			`print("INSPECTION COMPLETE")`
			`print("="*80)`

			`if __name__ == "__main__":`
			`main()`