""" Run single test with detailed debug output for YDQ23_001838.pdf """ import sys import os # Clear ALL cache print("=" * 80) print("CLEARING CACHE") print("=" * 80) import shutil import subprocess # Clear Python cache try: result = subprocess.run(['find', '.', '-name', '__pycache__', '-type', 'd', '-exec', 'rm', '-rf', '{}', '+'], capture_output=True, shell=False) print(f"Cache cleared (exit code: {result.returncode})") except: print("Using alternative cache clear...") for root, dirs, files in os.walk("."): for d in dirs[:100]: # Limit to avoid timeout if d == "__pycache__": try: shutil.rmtree(os.path.join(root, d)) print(f" Removed: {os.path.join(root, d)}") except: pass # Clear module cache modules_to_clear = list(sys.modules.keys()) for module in modules_to_clear: if module.startswith('cma_extraction') or module.startswith('test_accuracy') or module.startswith('paddleocr'): del sys.modules[module] print(f"Cleared {len(modules_to_clear)} modules from memory") print("\n" + "=" * 80) print("IMPORTING MODULES") print("=" * 80) # Set environment os.environ["DISABLE_MODEL_SOURCE_CHECK"] = "True" # Import fresh from test_accuracy_batch_full import process_single_pdf from pathlib import Path import json from paddleocr import PaddleOCR print("Modules imported successfully\n") # Test configuration pdf_name = "YDQ23_001838.pdf" pdf_dir = Path("src/test/resources/data/pdfs") output_dir = Path("test_reports_debug") / pdf_name output_dir.mkdir(parents=True, exist_ok=True) # Load expected results results_file = Path("src/test/resources/data/results.json") with open(results_file, 'r', encoding='utf-8') as f: expected_results = json.load(f) expected_cma = expected_results.get(pdf_name, {}).get('cma') expected_inst = expected_results.get(pdf_name, {}).get('institution') print("=" * 80) print("TEST CONFIGURATION") print("=" * 80) print(f"PDF: {pdf_name}") print(f"Expected CMA: {expected_cma}") print(f"Expected Institution: {expected_inst}") print(f"Output: {output_dir}") print() # Initialize OCR print("Initializing PaddleOCR...") ocr_engine = PaddleOCR(lang='ch') print("OCR initialized\n") # Run test print("=" * 80) print("RUNNING TEST") print("=" * 80) result = process_single_pdf( pdf_name=pdf_name, expected_cma=expected_cma, expected_inst=expected_inst, pdf_dir=pdf_dir, output_dir=output_dir, ocr_engine=ocr_engine, ocr_model="ppocr_v5", vl_pipeline=None ) # Display results print("\n" + "=" * 80) print("TEST RESULTS") print("=" * 80) print(f"Expected CMA: {expected_cma}") print(f"Extracted CMA: {result['extracted'].get('cma', 'N/A')}") print(f"CMA Match: {result['comparison']['cma'].get('match_type', 'UNKNOWN')}") print(f"CMA Similarity: {result['comparison']['cma'].get('similarity', 0):.1f}%") print() print(f"Expected Institution: {expected_inst}") print(f"Extracted Institution: {result['extracted'].get('institution', 'N/A')}") print(f"Institution Match: {result['comparison']['institution'].get('match_type', 'UNKNOWN')}") print(f"Institution Similarity: {result['comparison']['institution'].get('similarity', 0):.1f}%") print() # Check result if result['extracted'].get('cma') == expected_cma: print("✓ CMA EXTRACTION SUCCESSFUL") sys.exit(0) else: print("✗ CMA EXTRACTION FAILED") print(f"\nExtracted: {result['extracted'].get('cma')}") print(f"Expected: {expected_cma}") print("\nCheck debug output in:", output_dir) sys.exit(1)