"""
Run fresh test with cleared cache
"""
import sys
import os

# Clear all Python cache
# Walks the current working directory and deletes every "__pycache__"
# directory found, so stale bytecode is not reused by the test run below.
print("Clearing Python cache...")
import shutil

for root, dirs, _files in os.walk("."):
    if "__pycache__" not in dirs:
        continue
    # Prune the entry in place: os.walk is top-down here, so removing it from
    # ``dirs`` stops the walk from trying to descend into the directory that
    # is about to be deleted.
    dirs.remove("__pycache__")
    cache_path = os.path.join(root, "__pycache__")
    try:
        shutil.rmtree(cache_path)
    except OSError as exc:
        # Best-effort cleanup: a directory that cannot be removed (permissions,
        # symlink, concurrent deletion) must not abort the run, but report it
        # instead of hiding it. Only OSError is caught so Ctrl-C still works.
        print(f"  Skipped: {cache_path} ({exc})")
    else:
        print(f"  Removed: {cache_path}")

# Clear module cache
# Drop already-imported modules whose names start with one of the prefixes
# below from sys.modules, so a later import executes them again.
print("Clearing module cache...")
stale_prefixes = ('cma_extraction', 'test_accuracy')
modules_to_clear = []
for loaded_name in sys.modules:
    if loaded_name.startswith(stale_prefixes):
        modules_to_clear.append(loaded_name)
# Removal happens in a second pass, over the collected names, so sys.modules
# is never mutated while it is being iterated.
for stale_name in modules_to_clear:
    sys.modules.pop(stale_name)
print(f"  Cleared {len(modules_to_clear)} modules")

# Run test
# Processes one hard-coded PDF through process_single_pdf and prints how the
# extracted CMA value compares with the expected one from results.json.
banner = "=" * 80
print("\nRunning test for YDQ23_001838.pdf...")
print(banner)

# Deferred imports: these run after the cache-clearing steps earlier in the
# script rather than at the top of the file.
import json
from pathlib import Path

from test_accuracy_batch_full import process_single_pdf

pdf_name = "YDQ23_001838.pdf"
pdf_dir = Path("src/test/resources/data/pdfs")
output_dir = Path("test_reports_fresh")

# Load expected results
results_path = Path("src/test/resources/data/results.json")
expected_results = json.loads(results_path.read_text(encoding='utf-8'))

# A PDF without an entry in results.json yields None for both expectations.
expected_entry = expected_results.get(pdf_name, {})
expected_cma = expected_entry.get('cma')
expected_inst = expected_entry.get('institution')

# Initialize OCR
# The environment flag is set before paddleocr is imported here.
# NOTE(review): it is set only after test_accuracy_batch_full was imported;
# if that module imports paddleocr itself, the flag may take effect too
# late -- confirm.
os.environ["DISABLE_MODEL_SOURCE_CHECK"] = "True"
from paddleocr import PaddleOCR

ocr_engine = PaddleOCR(lang='ch')

# Process
# Reports are written to a per-PDF sub-directory of output_dir.
run_kwargs = {
    "pdf_name": pdf_name,
    "expected_cma": expected_cma,
    "expected_inst": expected_inst,
    "pdf_dir": pdf_dir,
    "output_dir": output_dir / pdf_name,
    "ocr_engine": ocr_engine,
    "ocr_model": "ppocr_v5",
    "vl_pipeline": None,
}
result = process_single_pdf(**run_kwargs)

# Summary of the CMA comparison for this PDF.
print("\n" + banner)
print("TEST RESULT")
print(banner)
print(f"Expected CMA: {expected_cma}")
print(f"Extracted CMA: {result['extracted']['cma']}")
cma_comparison = result['comparison']['cma']
print(f"Match: {cma_comparison.get('match_type', 'UNKNOWN')}")
print(f"Similarity: {cma_comparison.get('similarity', 0):.1f}%")