"""
Run fresh test with cleared cache

Removes every ``__pycache__`` directory below the current working directory,
evicts already-imported ``cma_extraction*`` / ``test_accuracy*`` modules from
``sys.modules``, then processes a single PDF through
``test_accuracy_batch_full.process_single_pdf`` and prints how the extracted
CMA value compares with the expected one from ``results.json``.
"""
import json
import os
import shutil
import sys
from pathlib import Path

# Inputs and outputs of the single-PDF run (paths are relative to the CWD).
PDF_NAME = "YDQ23_001838.pdf"
PDF_DIR = Path("src/test/resources/data/pdfs")
OUTPUT_DIR = Path("test_reports_fresh")
RESULTS_FILE = Path("src/test/resources/data/results.json")

# Modules whose names start with one of these prefixes are dropped from
# sys.modules so that the next import re-executes them.
STALE_MODULE_PREFIXES = ("cma_extraction", "test_accuracy")

SEPARATOR = "=" * 80


def clear_pycache(root="."):
    """Delete every ``__pycache__`` directory found below *root*.

    Removal is best-effort: a directory that cannot be deleted is reported
    and skipped instead of aborting the run.

    Args:
        root: Directory to walk. Defaults to the current working directory.

    Returns:
        list: Paths of the directories that were actually removed.
    """
    removed = []
    for current, dirs, _files in os.walk(root):
        if "__pycache__" not in dirs:
            continue
        cache_path = os.path.join(current, "__pycache__")
        try:
            shutil.rmtree(cache_path)
        except OSError as exc:
            # Best-effort cleanup: keep going, but do not hide the failure.
            print(f" Skipped: {cache_path} ({exc})")
        else:
            removed.append(cache_path)
            print(f" Removed: {cache_path}")
        # Prune in place so os.walk does not try to descend into a
        # directory that has just been deleted.
        dirs[:] = [name for name in dirs if name != "__pycache__"]
    return removed


def clear_module_cache(prefixes=STALE_MODULE_PREFIXES):
    """Remove modules whose name starts with one of *prefixes* from ``sys.modules``.

    Args:
        prefixes: Tuple of module-name prefixes to evict.

    Returns:
        int: Number of modules removed.
    """
    # Snapshot the names first: sys.modules must not change size while it is
    # being iterated.
    stale = [name for name in sys.modules if name.startswith(tuple(prefixes))]
    for name in stale:
        del sys.modules[name]
    return len(stale)


def main():
    """Clear caches, run the single-PDF test and print the CMA comparison."""
    # Clear all Python cache
    print("Clearing Python cache...")
    clear_pycache(".")

    # Clear module cache
    print("Clearing module cache...")
    cleared = clear_module_cache()
    print(f" Cleared {cleared} modules")

    # Run test
    print(f"\nRunning test for {PDF_NAME}...")
    print(SEPARATOR)

    # Imported only after the caches were cleared so the module is loaded fresh.
    from test_accuracy_batch_full import process_single_pdf

    # Load expected results
    with open(RESULTS_FILE, "r", encoding="utf-8") as f:
        expected_results = json.load(f)

    # A PDF without an entry in results.json yields None for both values.
    expected_entry = expected_results.get(PDF_NAME, {})
    expected_cma = expected_entry.get("cma")
    expected_inst = expected_entry.get("institution")

    # Initialize OCR (the environment variable is set before paddleocr is
    # imported, as in the original script order).
    os.environ["DISABLE_MODEL_SOURCE_CHECK"] = "True"
    from paddleocr import PaddleOCR

    ocr_engine = PaddleOCR(lang="ch")

    # Process
    result = process_single_pdf(
        pdf_name=PDF_NAME,
        expected_cma=expected_cma,
        expected_inst=expected_inst,
        pdf_dir=PDF_DIR,
        output_dir=OUTPUT_DIR / PDF_NAME,
        ocr_engine=ocr_engine,
        ocr_model="ppocr_v5",
        vl_pipeline=None,
    )

    cma_comparison = result["comparison"]["cma"]

    print("\n" + SEPARATOR)
    print("TEST RESULT")
    print(SEPARATOR)
    print(f"Expected CMA: {expected_cma}")
    print(f"Extracted CMA: {result['extracted']['cma']}")
    print(f"Match: {cma_comparison.get('match_type', 'UNKNOWN')}")
    print(f"Similarity: {cma_comparison.get('similarity', 0):.1f}%")


if __name__ == "__main__":
    main()