"""
Run fresh test with cleared cache.

Steps, executed in order at module level:

1. Delete every ``__pycache__`` directory found below the current working
   directory.
2. Drop already-imported modules whose names start with ``cma_extraction``
   or ``test_accuracy`` from ``sys.modules``.
3. Load the expected values for one PDF from ``results.json``.
4. Run ``process_single_pdf`` on that PDF with a PaddleOCR engine and print
   how the extracted CMA value compares with the expected one.
"""
import json
import os
import shutil
import sys
from pathlib import Path

# Names of the on-disk bytecode cache directory and of the module-name
# prefixes that are purged from ``sys.modules`` before the test runs.
CACHE_DIR_NAME = "__pycache__"
MODULE_PREFIXES = ("cma_extraction", "test_accuracy")

pdf_name = "YDQ23_001838.pdf"
pdf_dir = Path("src/test/resources/data/pdfs")
output_dir = Path("test_reports_fresh")
results_file = Path("src/test/resources/data/results.json")

# Clear all Python cache
print("Clearing Python cache...")
for root, dirs, _files in os.walk("."):
    if CACHE_DIR_NAME not in dirs:
        continue
    # Prune the entry in place so the top-down walk does not try to descend
    # into the directory that is about to be deleted.
    dirs.remove(CACHE_DIR_NAME)
    cache_path = os.path.join(root, CACHE_DIR_NAME)
    try:
        shutil.rmtree(cache_path)
    except OSError as exc:
        # Best effort: a cache directory that cannot be removed must not
        # abort the run, but the failure is reported instead of hidden.
        print(f" Could not remove {cache_path}: {exc}")
    else:
        print(f" Removed: {cache_path}")

# Clear module cache
print("Clearing module cache...")
# Collect the names first: deleting from sys.modules while iterating over it
# would raise a RuntimeError.
modules_to_clear = [m for m in sys.modules if m.startswith(MODULE_PREFIXES)]
for module in modules_to_clear:
    del sys.modules[module]
print(f" Cleared {len(modules_to_clear)} modules")

# Run test
print(f"\nRunning test for {pdf_name}...")
print("=" * 80)

# Imported only here, after the caches above have been cleared.
from test_accuracy_batch_full import process_single_pdf  # noqa: E402

# Load expected results
with open(results_file, 'r', encoding='utf-8') as f:
    expected_results = json.load(f)

# A PDF without an entry in results.json yields None for both expectations.
expected_entry = expected_results.get(pdf_name, {})
expected_cma = expected_entry.get('cma')
expected_inst = expected_entry.get('institution')

# Initialize OCR
# The variable is set before paddleocr is imported.
# NOTE(review): presumably paddleocr reads it at import time - confirm.
os.environ["DISABLE_MODEL_SOURCE_CHECK"] = "True"
from paddleocr import PaddleOCR  # noqa: E402

ocr_engine = PaddleOCR(lang='ch')

# Process
result = process_single_pdf(
    pdf_name=pdf_name,
    expected_cma=expected_cma,
    expected_inst=expected_inst,
    pdf_dir=pdf_dir,
    output_dir=output_dir / pdf_name,
    ocr_engine=ocr_engine,
    ocr_model="ppocr_v5",
    vl_pipeline=None,
)

# Report the CMA comparison. 'match_type' and 'similarity' fall back to
# 'UNKNOWN' and 0 when the comparison dict does not contain them.
cma_comparison = result['comparison']['cma']
print("\n" + "=" * 80)
print("TEST RESULT")
print("=" * 80)
print(f"Expected CMA: {expected_cma}")
print(f"Extracted CMA: {result['extracted']['cma']}")
print(f"Match: {cma_comparison.get('match_type', 'UNKNOWN')}")
print(f"Similarity: {cma_comparison.get('similarity', 0):.1f}%")