# Source: report-detect/archive/temp_scripts/run_single_test.py
# (file-viewer metadata: 121 lines, 3.5 KiB, Python)

"""
Run single test with detailed debug output for YDQ23_001838.pdf
"""
import sys
import os
# Clear ALL cache
print("=" * 80)
print("CLEARING CACHE")
print("=" * 80)
import shutil
import subprocess


def remove_pycache_dirs(root="."):
    """Delete every ``__pycache__`` directory found below *root*.

    Directories that cannot be removed are reported and skipped, so the
    clean-up stays best-effort and never aborts the script.

    Args:
        root: Directory whose tree is scanned.

    Returns:
        List of the ``__pycache__`` paths that were actually removed.
    """
    removed = []
    for current, dirnames, _ in os.walk(root):
        if "__pycache__" not in dirnames:
            continue
        # Prune in place so os.walk does not descend into the directory
        # that is about to be deleted.
        dirnames.remove("__pycache__")
        target = os.path.join(current, "__pycache__")
        try:
            shutil.rmtree(target)
        except OSError as exc:
            print(f"  Could not remove {target}: {exc}")
        else:
            removed.append(target)
            print(f"  Removed: {target}")
    return removed


def purge_cached_modules(prefixes):
    """Drop already-imported modules from ``sys.modules`` by name prefix.

    Args:
        prefixes: Iterable of module-name prefixes to match.

    Returns:
        List of the module names that were removed.
    """
    prefixes = tuple(prefixes)  # str.startswith accepts a tuple, not a list
    stale = [name for name in list(sys.modules) if name.startswith(prefixes)]
    for name in stale:
        del sys.modules[name]
    return stale


# Clear Python cache: try the external `find` command first.
try:
    find_run = subprocess.run(
        ['find', '.', '-name', '__pycache__', '-type', 'd', '-exec', 'rm', '-rf', '{}', '+'],
        capture_output=True, shell=False)
except OSError:
    # The `find` executable could not be started (e.g. it is not installed).
    find_run = None

if find_run is not None and find_run.returncode == 0:
    print(f"Cache cleared (exit code: {find_run.returncode})")
else:
    # Either `find` is unavailable or it reported a failure: fall back to a
    # pure-Python walk instead of claiming the cache was cleared.
    print("Using alternative cache clear...")
    remove_pycache_dirs(".")

# Clear module cache so the imports further down load fresh copies.
cleared_modules = purge_cached_modules(('cma_extraction', 'test_accuracy', 'paddleocr'))
# Report the number of modules actually removed, not the size of sys.modules.
print(f"Cleared {len(cleared_modules)} modules from memory")
banner = "=" * 80
print(f"\n{banner}")
print("IMPORTING MODULES")
print(banner)

# Set environment ahead of the project / OCR imports below.
# NOTE(review): presumably this flag is read by the OCR stack at import
# time -- confirm before moving it.
os.environ.update({"DISABLE_MODEL_SOURCE_CHECK": "True"})

# Import fresh: standard-library helpers first, then the project entry point
# and the OCR engine (same relative order as before).
import json
from pathlib import Path

from test_accuracy_batch_full import process_single_pdf
from paddleocr import PaddleOCR

print("Modules imported successfully\n")
# Test configuration
pdf_name = "YDQ23_001838.pdf"
pdf_dir = Path("src/test/resources/data/pdfs")
# Debug output goes to a per-PDF folder named after the PDF file itself.
output_dir = Path("test_reports_debug") / pdf_name
output_dir.mkdir(parents=True, exist_ok=True)

# Load expected results
results_file = Path("src/test/resources/data/results.json")
with open(results_file, 'r', encoding='utf-8') as f:
    expected_results = json.load(f)

# A missing entry would otherwise silently turn into "expected None" values;
# make that visible without aborting the run.
if pdf_name not in expected_results:
    print(f"WARNING: no expected results for {pdf_name} in {results_file}")
expected_entry = expected_results.get(pdf_name, {})
expected_cma = expected_entry.get('cma')
expected_inst = expected_entry.get('institution')

print("=" * 80)
print("TEST CONFIGURATION")
print("=" * 80)
print(f"PDF: {pdf_name}")
print(f"Expected CMA: {expected_cma}")
print(f"Expected Institution: {expected_inst}")
print(f"Output: {output_dir}")
print()
# Initialize OCR
print("Initializing PaddleOCR...")
ocr_engine = PaddleOCR(lang='ch')
print("OCR initialized\n")

# Run test
rule = "=" * 80
print(rule, "RUNNING TEST", rule, sep="\n")

# Keyword arguments for the single-PDF run, collected in one mapping and
# passed through unchanged to process_single_pdf.
run_kwargs = dict(
    pdf_name=pdf_name,
    expected_cma=expected_cma,
    expected_inst=expected_inst,
    pdf_dir=pdf_dir,
    output_dir=output_dir,
    ocr_engine=ocr_engine,
    ocr_model="ppocr_v5",
    vl_pipeline=None,
)
result = process_single_pdf(**run_kwargs)
# Display results
print("\n" + "=" * 80)
print("TEST RESULTS")
print("=" * 80)
print(f"Expected CMA: {expected_cma}")
print(f"Extracted CMA: {result['extracted'].get('cma', 'N/A')}")
print(f"CMA Match: {result['comparison']['cma'].get('match_type', 'UNKNOWN')}")
print(f"CMA Similarity: {result['comparison']['cma'].get('similarity', 0):.1f}%")
print()
print(f"Expected Institution: {expected_inst}")
print(f"Extracted Institution: {result['extracted'].get('institution', 'N/A')}")
print(f"Institution Match: {result['comparison']['institution'].get('match_type', 'UNKNOWN')}")
print(f"Institution Similarity: {result['comparison']['institution'].get('similarity', 0):.1f}%")
print()

# Check result: the exit status depends only on an exact CMA match; the
# institution comparison above is shown for information and does not affect it.
extracted_cma = result['extracted'].get('cma')
if extracted_cma == expected_cma:
    print("✓ CMA EXTRACTION SUCCESSFUL")
    sys.exit(0)
else:
    print("✗ CMA EXTRACTION FAILED")
    print(f"\nExtracted: {extracted_cma}")
    print(f"Expected: {expected_cma}")
    print("\nCheck debug output in:", output_dir)
    sys.exit(1)