# report-detect/archive/temp_scripts/test_crt_extraction.py

# 45 lines
# 1.2 KiB
# Python

"""
Test CRT extraction for YDQ25_002294.pdf
"""
import sys
import os
from pathlib import Path
# Set the flag before the import below so it is already present in the
# environment when test_accuracy_batch_full (and whatever it imports) loads.
# NOTE(review): presumably this turns off a model-source check in a
# dependency -- confirm which library reads DISABLE_MODEL_SOURCE_CHECK.
os.environ["DISABLE_MODEL_SOURCE_CHECK"] = "True"
# Put this script's own directory first on sys.path so that
# test_accuracy_batch_full (expected to live next to this file) is importable,
# then import the CRT extraction function from it.
sys.path.insert(0, os.path.dirname(__file__))
from test_accuracy_batch_full import extract_institution_from_crt
# Test PDF. The path is relative, so it is resolved against the current
# working directory (run the script from the directory that contains src/).
pdf_path = Path("src/test/resources/data/pdfs/YDQ25_002294.pdf")
print(f"Testing CRT extraction for: {pdf_path}")
print("=" * 80)

# Check if file exists; stop with a non-zero exit status when it does not,
# so the extraction helper is never called on a missing file.
if not pdf_path.exists():
    print(f"ERROR: PDF not found: {pdf_path}")
    sys.exit(1)

# Extract institutions from CRT.
# NOTE(review): the helper's return contract is not visible from this file.
# A None result is treated like "nothing found" so the report below is still
# printed instead of len() raising TypeError.
institutions = extract_institution_from_crt(str(pdf_path)) or []

# Report: count, numbered list, then a one-line verdict.
print("\n" + "=" * 80)
print("RESULTS")
print("=" * 80)
print(f"Institutions found: {len(institutions)}")
for idx, inst in enumerate(institutions, 1):
    print(f" {idx}. {inst}")

if institutions:
    # The first entry is the one shown in the success line.
    print(f"\n✓ CRT extraction SUCCESS: {institutions[0]}")
else:
    print("\n✗ CRT extraction FAILED: No institutions found")
    print("\nPossible reasons:")
    print(" 1. PDF has no digital signatures (scanned PDF)")
    print(" 2. PDF signatures are not accessible (locked/encrypted)")
    print(" 3. Certificate parsing failed")

# Closing rule for the report, printed on both the success and failure paths.
print("=" * 80)