report-detect/archive/crt_tests/standalone_crt_test.py

"""
独立的CRT提取测试 - 不依赖大型模块
"""
import pikepdf
from cryptography.hazmat.primitives.serialization.pkcs7 import load_der_pkcs7_certificates
from cryptography.x509.oid import NameOID
import re

def _get_name_attr(name, oid: NameOID):
    """Return the first value stored under ``oid`` in an X.500 name.

    Args:
        name: Object exposing ``get_attributes_for_oid(oid)``.
        oid: Attribute identifier to look up.

    Returns:
        The ``.value`` of the first matching attribute, or ``None`` when the
        lookup raises ``ValueError`` or yields nothing.
    """
    try:
        matches = name.get_attributes_for_oid(oid)
    except ValueError:
        return None

    # An empty (or otherwise falsy) result means the attribute is absent.
    if not matches:
        return None
    return matches[0].value

def _trim_der_padding(data: bytes) -> bytes:
    """Cut ``data`` down to its first top-level DER SEQUENCE.

    Returns ``data`` unchanged when it does not start with a definite-length
    SEQUENCE or when the declared length exceeds the available bytes.
    """
    # 0x30 is the tag of a constructed SEQUENCE (the outer CMS ContentInfo).
    if len(data) < 2 or data[0] != 0x30:
        return data

    first_len = data[1]
    if first_len < 0x80:
        # Short form: the byte itself is the content length.
        header, length = 2, first_len
    else:
        # Long form: low 7 bits give the number of length bytes that follow.
        n_len_bytes = first_len & 0x7F
        # 0 means indefinite length (BER); >4 would be an implausible size.
        if n_len_bytes == 0 or n_len_bytes > 4 or len(data) < 2 + n_len_bytes:
            return data
        header = 2 + n_len_bytes
        length = int.from_bytes(data[2:header], "big")

    end = header + length
    return data[:end] if end <= len(data) else data


def parse_certificates_improved(signature_bytes: bytes) -> list:
    """Collect candidate institution names from a PDF signature blob.

    Two strategies are tried in order:

    1. Parse the blob as DER PKCS#7 and read CN, O and OU from every
       certificate subject, keeping stripped values of length >= 4.
    2. Only if step 1 produced nothing: search the raw bytes for a fixed list
       of known institution names, then run regex patterns over the bytes
       decoded as UTF-8 (undecodable bytes ignored).

    Args:
        signature_bytes: Raw bytes of the signature ``/Contents`` entry.

    Returns:
        De-duplicated list of candidate names in discovery order; empty when
        nothing was found. Parsing errors are printed, never raised.
    """
    candidates = []

    # Method 1: Try PKCS#7 parsing first.
    try:
        # PDF writers reserve a fixed-size /Contents slot and zero-pad the
        # unused tail; a strict DER parser may reject those trailing bytes,
        # so hand it only the first top-level element.
        certs = load_der_pkcs7_certificates(_trim_der_padding(signature_bytes))

        subject_oids = (
            NameOID.COMMON_NAME,
            NameOID.ORGANIZATION_NAME,
            NameOID.ORGANIZATIONAL_UNIT_NAME,
        )
        # Every certificate in the bundle is inspected, not just the first.
        for cert in certs:
            for oid in subject_oids:
                val = _get_name_attr(cert.subject, oid)
                if not val:
                    continue
                clean = val.strip()
                if len(clean) >= 4 and clean not in candidates:
                    candidates.append(clean)
    except Exception as e:
        # Deliberately broad: this is a best-effort probe and any parser
        # failure should fall through to the byte-level fallback.
        print(f"    PKCS#7 parsing failed: {e}")

    if candidates:
        return candidates

    # Method 2: Fallback - search for known institution names in binary data.
    print("    No candidates from PKCS#7, trying binary search fallback...")

    known_institutions = [
        "广东产品质量监督检验研究院",
        "广东产品质量监督检验",
        "广东省产品质量监督检验研究院",
        "质量监督检验研究院",
    ]

    for inst in known_institutions:
        # X.509 names may be stored as UTF8String or BMPString (UTF-16BE),
        # so look for both byte encodings.
        encodings = (inst.encode("utf-8"), inst.encode("utf-16-be"))
        found = any(encoded in signature_bytes for encoded in encodings)
        if found and inst not in candidates:
            candidates.append(inst)
            print(f"    Found in binary data: {inst}")

    # Also try pattern matching (UTF-8 view of the bytes only).
    try:
        decoded = signature_bytes.decode('utf-8', errors='ignore')
        patterns = [
            r'[\u4e00-\u9fff]{4,}(?:研究院|研究所|检测中心|检验院)',
            r'[\u4e00-\u9fff]{4,}(?:有限公司)',
        ]

        for pattern in patterns:
            # Each pattern already requires >= 4 characters plus a suffix,
            # so no extra length check is needed on the match.
            for match in re.findall(pattern, decoded):
                if match not in candidates:
                    candidates.append(match)
                    print(f"    Found pattern: {match}")

    except Exception as e:
        print(f"    Pattern matching failed: {e}")

    return candidates

def extract_institution_from_crt_improved(pdf_path: str) -> list:
    """Extract candidate institution names from a PDF's signature fields.

    Walks the top-level ``/AcroForm`` ``/Fields`` entries, and for every
    field of type ``/Sig`` that has a ``/V`` dictionary with ``/Contents``,
    passes the raw bytes to ``parse_certificates_improved``.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        De-duplicated list of candidate names in discovery order. An empty
        list is returned when the PDF cannot be opened, has no ``/AcroForm``,
        or an error occurs while scanning (errors are printed, not raised).
    """
    # Scanning stops once something was found AND at least this many
    # signatures have actually been examined.
    min_signatures_checked = 3

    try:
        pdf = pikepdf.Pdf.open(pdf_path)
    except Exception as e:
        print(f"Failed to open PDF: {e}")
        return []

    try:
        acroform = pdf.Root.get("/AcroForm")
        if not acroform:
            print("No /AcroForm found")
            return []

        all_candidates = []
        signatures_checked = 0

        for idx, field in enumerate(acroform.get("/Fields", [])):
            if field.get("/FT") != "/Sig":
                continue

            sig_dict = field.get("/V")
            if not sig_dict:
                continue

            contents_obj = sig_dict.get("/Contents")
            if contents_obj is None:
                continue

            # Copy the bytes out so nothing depends on the PDF after close.
            contents = bytes(contents_obj)
            signatures_checked += 1
            # idx is the position in /Fields, kept so the output is unchanged.
            print(f"\n  Signature #{idx}:")
            print(f"    Size: {len(contents)} bytes")

            for candidate in parse_certificates_improved(contents):
                if candidate not in all_candidates:
                    all_candidates.append(candidate)

            # Count real signatures rather than the field index, so that
            # non-signature form fields do not trigger an early exit.
            if all_candidates and signatures_checked >= min_signatures_checked:
                break

        return all_candidates

    except Exception as e:
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()
        return []
    finally:
        # Release the file handle on every exit path.
        pdf.close()

def main():
    """Run the extraction against the bundled sample PDFs and print a verdict.

    For each (path, expected name) pair the extracted candidates are printed,
    followed by SUCCESS when the expected name is among them, PARTIAL SUCCESS
    when only other names were found, or FAILED when nothing was extracted.
    """
    banner = "=" * 80
    divider = "#" * 80

    cases = [
        ("src/test/resources/data/pdfs/YDQ25_002294.pdf", "广东产品质量监督检验研究院"),
        ("src/test/resources/data/pdfs/YDQ23_001838.pdf", "广东产品质量监督检验研究院"),
    ]

    print(banner)
    print("STANDALONE CRT EXTRACTION TEST")
    print(banner)

    for path, wanted in cases:
        print("\n" + divider)
        print(f"Testing: {path}")
        print(f"Expected: {wanted}")
        print(divider)

        found = extract_institution_from_crt_improved(path)

        print(f"\nResult: {found}")

        if wanted in found:
            print("✓✓✓ SUCCESS! Found expected institution")
            continue

        if not found:
            print("✗✗✗ FAILED! No institutions extracted")
            continue

        # Something was extracted, but not the name we were looking for.
        print("⚠ PARTIAL SUCCESS! Found institutions but not expected:")
        print(f"   Expected: {wanted}")
        print(f"   Got: {found}")

    print("\n" + banner)

# Run the manual check only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()
-												chore(project): conservative cleanup - archive temp scripts and old docs

Major cleanup to improve project organization and maintainability.

Changes:
- Moved 34 temp/debug/test scripts to archive/temp_scripts/
- Moved 9 auxiliary tools to archive/tools/
- Moved 3 CRT test scripts to archive/crt_tests/
- Moved 4 OCR test scripts to archive/ocr_tests/
- Moved 14 old documentation files to archive/docs/
- Deleted 4 useless files (duplicates, temp files)

Root directory:
- Before: 67 files (cluttered)
- After: 10 core files (clean and organized)

Core files retained:
- test_accuracy_batch_full.py (main script)
- cma_extraction_template_primary.py (CMA extraction)
- cma_extraction_final.py (backup CMA extraction)
- CLAUDE.md (project guide)
- TEST_ACCURACY_BATCH_README.md (usage guide)
- TEST_ACCURACY_BATCH_DEPENDENCIES.md (dependency docs)
- CLEANUP_PLAN.md (cleanup plan)
- CLEANUP_SUMMARY.md (cleanup summary)
- IMPLEMENTATION_SUMMARY.md (implementation summary)
- requirements.txt (dependencies)

Archive structure:
archive/
├── temp_scripts/ (34 files: test_, debug_, analyze_, etc.)
├── tools/ (9 files: find_, show_, visualize_, etc.)
├── crt_tests/ (3 files: CRT extraction tests)
├── ocr_tests/ (4 files: OCR timeout tests)
└── docs/ (14 files: old reports and guides)

Benefits:
✓ Cleaner root directory - easier navigation
✓ Better organization - clear separation of concerns
✓ Preserved history - all files archived, not deleted
✓ Improved maintainability - easier to find active files
✓ Better git history - removed 198 deleted files from tracking

No functional changes - all core functionality preserved.

Related:
- TEST_ACCURACY_BATCH_DEPENDENCIES.md - dependency analysis
- CLEANUP_PLAN.md - detailed cleanup plan

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-03-03 14:35:06 +08:00
+								"""
 								独立的CRT提取测试 - 不依赖大型模块
 								"""
 								import pikepdf
 								from cryptography.hazmat.primitives.serialization.pkcs7 import load_der_pkcs7_certificates
 								from cryptography.x509.oid import NameOID
 								import re
 								def _get_name_attr(name, oid: NameOID):
 								    """Extract attribute value from X.500 name by OID."""
 								    try:
 								        values = name.get_attributes_for_oid(oid)
 								    except ValueError:
 								        return None
 								    return values[0].value if values else None
 								def parse_certificates_improved(signature_bytes: bytes) -> list:
 								    """
 								    改进的证书解析函数，添加binary search fallback
 								    """
 								    candidates = []
 								    # Method 1: Try PKCS#7 parsing first
 								    try:
 								        certs = load_der_pkcs7_certificates(signature_bytes)
 								        # Usually first cert in bundle is signer's cert
 								        for cert in certs:
 								            # Collect potential organization names from CN, O, OU
 								            def add_if_valid(oid):
 								                val = _get_name_attr(cert.subject, oid)
 								                if val:
 								                    clean = val.strip()
 								                    if len(clean) >= 4 and clean not in candidates:
 								                        candidates.append(clean)
 								            add_if_valid(NameOID.COMMON_NAME)
 								            add_if_valid(NameOID.ORGANIZATION_NAME)
 								            add_if_valid(NameOID.ORGANIZATIONAL_UNIT_NAME)
 								    except Exception as e:
 								        print(f"    PKCS#7 parsing failed: {e}")
 								    # Method 2: Fallback - search for known institution names in binary data
 								    if not candidates:
 								        print(f"    No candidates from PKCS#7, trying binary search fallback...")
 								        known_institutions = [
 								            "广东产品质量监督检验研究院",
 								            "广东产品质量监督检验",
 								            "广东省产品质量监督检验研究院",
 								            "质量监督检验研究院",
 								        ]
 								        for inst in known_institutions:
 								            encoded = inst.encode('utf-8')
 								            if encoded in signature_bytes:
 								                if inst not in candidates:
 								                    candidates.append(inst)
 								                    print(f"    Found in binary data: {inst}")
 								        # Also try pattern matching
 								        try:
 								            decoded = signature_bytes.decode('utf-8', errors='ignore')
 								            patterns = [
 								                r'[\u4e00-\u9fff]{4,}(?:研究院|研究所|检测中心|检验院)',
 								                r'[\u4e00-\u9fff]{4,}(?:有限公司)',
 								            ]
 								            for pattern in patterns:
 								                matches = re.findall(pattern, decoded)
 								                for match in matches:
 								                    if len(match) >= 4 and match not in candidates:
 								                        candidates.append(match)
 								                        print(f"    Found pattern: {match}")
 								        except Exception as e:
 								            print(f"    Pattern matching failed: {e}")
 								    return candidates
 								def extract_institution_from_crt_improved(pdf_path: str) -> list:
 								    """改进的CRT提取函数"""
 								    try:
 								        pdf = pikepdf.Pdf.open(pdf_path)
 								    except Exception as e:
 								        print(f"Failed to open PDF: {e}")
 								        return []
 								    try:
 								        acroform = pdf.Root.get("/AcroForm")
 								        if not acroform:
 								            print("No /AcroForm found")
 								            return []
 								        fields = acroform.get("/Fields", [])
 								        all_candidates = []
 								        for idx, field in enumerate(fields):
 								            field_obj = field
 								            if field_obj.get("/FT") != "/Sig":
 								                continue
 								            sig_dict = field_obj.get("/V")
 								            if not sig_dict:
 								                continue
 								            contents_obj = sig_dict.get("/Contents")
 								            if contents_obj is None:
 								                continue
 								            contents = bytes(contents_obj)
 								            print(f"\n  Signature #{idx}:")
 								            print(f"    Size: {len(contents)} bytes")
 								            candidates = parse_certificates_improved(contents)
 								            for candidate in candidates:
 								                if candidate not in all_candidates:
 								                    all_candidates.append(candidate)
 								            if len(all_candidates) > 0 and idx >= 2:  # Found candidates and checked 3 signatures
 								                break
 								        return all_candidates
 								    except Exception as e:
 								        print(f"Error: {e}")
 								        import traceback
 								        traceback.print_exc()
 								        return []
 								def main():
 								    test_pdfs = [
 								        ("src/test/resources/data/pdfs/YDQ25_002294.pdf", "广东产品质量监督检验研究院"),
 								        ("src/test/resources/data/pdfs/YDQ23_001838.pdf", "广东产品质量监督检验研究院"),
 								    ]
 								    print("="*80)
 								    print("STANDALONE CRT EXTRACTION TEST")
 								    print("="*80)
 								    for pdf_path, expected in test_pdfs:
 								        print(f"\n{'#'*80}")
 								        print(f"Testing: {pdf_path}")
 								        print(f"Expected: {expected}")
 								        print(f"{'#'*80}")
 								        result = extract_institution_from_crt_improved(pdf_path)
 								        print(f"\nResult: {result}")
 								        if expected in result:
 								            print(f"✓✓✓ SUCCESS! Found expected institution")
 								        elif result:
 								            print(f"⚠ PARTIAL SUCCESS! Found institutions but not expected:")
 								            print(f"   Expected: {expected}")
 								            print(f"   Got: {result}")
 								        else:
 								            print(f"✗✗✗ FAILED! No institutions extracted")
 								    print("\n" + "="*80)
 								if __name__ == "__main__":
 								    main()